"""howard.objects.variants"""

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26
   27from howard.functions.commons import *
   28from howard.objects.database import *
   29from howard.functions.databases import *
   30from howard.functions.utils import *
   31
   32
   33class Variants:
   34
   35    def __init__(
   36        self,
   37        conn=None,
   38        input: str = None,
   39        output: str = None,
   40        config: dict = {},
   41        param: dict = {},
   42        load: bool = False,
   43    ) -> None:
   44        """
   45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   46        header
   47
   48        :param conn: the connection to the database
   49        :param input: the input file
   50        :param output: the output file
   51        :param config: a dictionary containing the configuration of the model
   52        :param param: a dictionary containing the parameters of the model
   53        """
   54
   55        # Init variables
   56        self.init_variables()
   57
   58        # Input
   59        self.set_input(input)
   60
   61        # Config
   62        self.set_config(config)
   63
   64        # Param
   65        self.set_param(param)
   66
   67        # Output
   68        self.set_output(output)
   69
   70        # connexion
   71        self.set_connexion(conn)
   72
   73        # Header
   74        self.set_header()
   75
   76        # Samples
   77        self.set_samples()
   78
   79        # Load data
   80        if load:
   81            self.load_data()
   82
   83    def set_samples(self, samples: list = None) -> list:
   84        """
   85        The function `set_samples` sets the samples attribute of an object to a provided list or
   86        retrieves it from a parameter dictionary.
   87
   88        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   89        input and sets the `samples` attribute of the class to the provided list. If no samples are
   90        provided, it tries to get the samples from the class's parameters using the `get_param` method
   91        :type samples: list
   92        :return: The `samples` list is being returned.
   93        """
   94
   95        if not samples:
   96            samples = self.get_param().get("samples", {}).get("list", None)
   97
   98        self.samples = samples
   99
  100        return samples
  101
  102    def get_samples(self) -> list:
  103        """
  104        This function returns a list of samples.
  105        :return: The `get_samples` method is returning the `samples` attribute of the object.
  106        """
  107
  108        return self.samples
  109
  110    def get_samples_check(self) -> bool:
  111        """
  112        This function returns the value of the "check" key within the "samples" dictionary retrieved
  113        from the parameters.
  114        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  115        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  116        method. If the key "check" is not found, it will return `False`.
  117        """
  118
  119        return self.get_param().get("samples", {}).get("check", True)
  120
  121    def set_input(self, input: str = None) -> None:
  122        """
  123        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  124        attributes in the class accordingly.
  125
  126        :param input: The `set_input` method in the provided code snippet is used to set attributes
  127        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  128        :type input: str
  129        """
  130
  131        if input and not isinstance(input, str):
  132            try:
  133                self.input = input.name
  134            except:
  135                log.error(f"Input file '{input} in bad format")
  136                raise ValueError(f"Input file '{input} in bad format")
  137        else:
  138            self.input = input
  139
  140        # Input format
  141        if input:
  142            input_name, input_extension = os.path.splitext(self.input)
  143            self.input_name = input_name
  144            self.input_extension = input_extension
  145            self.input_format = self.input_extension.replace(".", "")
  146
  147    def set_config(self, config: dict) -> None:
  148        """
  149        The set_config function takes a config object and assigns it as the configuration object for the
  150        class.
  151
  152        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  153        contains configuration settings for the class. When you call the `set_config` function with a
  154        dictionary object as the argument, it will set that dictionary as the configuration object for
  155        the class
  156        :type config: dict
  157        """
  158
  159        self.config = config
  160
  161    def set_param(self, param: dict) -> None:
  162        """
  163        This function sets a parameter object for the class based on the input dictionary.
  164
  165        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  166        as the `param` attribute of the class instance
  167        :type param: dict
  168        """
  169
  170        self.param = param
  171
  172    def init_variables(self) -> None:
  173        """
  174        This function initializes the variables that will be used in the rest of the class
  175        """
  176
  177        self.prefix = "howard"
  178        self.table_variants = "variants"
  179        self.dataframe = None
  180
  181        self.comparison_map = {
  182            "gt": ">",
  183            "gte": ">=",
  184            "lt": "<",
  185            "lte": "<=",
  186            "equals": "=",
  187            "contains": "SIMILAR TO",
  188        }
  189
  190        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  191
  192        self.code_type_map_to_sql = {
  193            "Integer": "INTEGER",
  194            "String": "VARCHAR",
  195            "Float": "FLOAT",
  196            "Flag": "VARCHAR",
  197        }
  198
  199        self.index_additionnal_fields = []
  200
  201    def get_indexing(self) -> bool:
  202        """
  203        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  204        returns False.
  205        :return: The value of the indexing parameter.
  206        """
  207
  208        return self.get_param().get("indexing", False)
  209
  210    def get_connexion_config(self) -> dict:
  211        """
  212        The function `get_connexion_config` returns a dictionary containing the configuration for a
  213        connection, including the number of threads and memory limit.
  214        :return: a dictionary containing the configuration for the Connexion library.
  215        """
  216
  217        # config
  218        config = self.get_config()
  219
  220        # Connexion config
  221        connexion_config = {}
  222        threads = self.get_threads()
  223
  224        # Threads
  225        if threads:
  226            connexion_config["threads"] = threads
  227
  228        # Memory
  229        # if config.get("memory", None):
  230        #     connexion_config["memory_limit"] = config.get("memory")
  231        if self.get_memory():
  232            connexion_config["memory_limit"] = self.get_memory()
  233
  234        # Temporary directory
  235        if config.get("tmp", None):
  236            connexion_config["temp_directory"] = config.get("tmp")
  237
  238        # Access
  239        if config.get("access", None):
  240            access = config.get("access")
  241            if access in ["RO"]:
  242                access = "READ_ONLY"
  243            elif access in ["RW"]:
  244                access = "READ_WRITE"
  245            connexion_db = self.get_connexion_db()
  246            if connexion_db in ":memory:":
  247                access = "READ_WRITE"
  248            connexion_config["access_mode"] = access
  249
  250        return connexion_config
  251
  252    def get_duckdb_settings(self) -> dict:
  253        """
  254        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  255        string.
  256        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  257        """
  258
  259        # config
  260        config = self.get_config()
  261
  262        # duckdb settings
  263        duckdb_settings_dict = {}
  264        if config.get("duckdb_settings", None):
  265            duckdb_settings = config.get("duckdb_settings")
  266            duckdb_settings = full_path(duckdb_settings)
  267            # duckdb setting is a file
  268            if os.path.exists(duckdb_settings):
  269                with open(duckdb_settings) as json_file:
  270                    duckdb_settings_dict = yaml.safe_load(json_file)
  271            # duckdb settings is a string
  272            else:
  273                duckdb_settings_dict = json.loads(duckdb_settings)
  274
  275        return duckdb_settings_dict
  276
  277    def set_connexion_db(self) -> str:
  278        """
  279        The function `set_connexion_db` returns the appropriate database connection string based on the
  280        input format and connection type.
  281        :return: the value of the variable `connexion_db`.
  282        """
  283
  284        # Default connexion db
  285        default_connexion_db = ":memory:"
  286
  287        # Find connexion db
  288        if self.get_input_format() in ["db", "duckdb"]:
  289            connexion_db = self.get_input()
  290        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  291            connexion_db = default_connexion_db
  292        elif self.get_connexion_type() in ["tmpfile"]:
  293            tmp_name = tempfile.mkdtemp(
  294                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  295            )
  296            connexion_db = f"{tmp_name}/tmp.db"
  297        elif self.get_connexion_type() != "":
  298            connexion_db = self.get_connexion_type()
  299        else:
  300            connexion_db = default_connexion_db
  301
  302        # Set connexion db
  303        self.connexion_db = connexion_db
  304
  305        return connexion_db
  306
  307    def set_connexion(self, conn) -> None:
  308        """
  309        The function `set_connexion` creates a connection to a database, with options for different
  310        database formats and settings.
  311
  312        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  313        database. If a connection is not provided, a new connection to an in-memory database is created.
  314        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  315        sqlite
  316        """
  317
  318        # Connexion db
  319        connexion_db = self.set_connexion_db()
  320
  321        # Connexion config
  322        connexion_config = self.get_connexion_config()
  323
  324        # Connexion format
  325        connexion_format = self.get_config().get("connexion_format", "duckdb")
  326        # Set connexion format
  327        self.connexion_format = connexion_format
  328
  329        # Connexion
  330        if not conn:
  331            if connexion_format in ["duckdb"]:
  332                conn = duckdb.connect(connexion_db, config=connexion_config)
  333                # duckDB settings
  334                duckdb_settings = self.get_duckdb_settings()
  335                if duckdb_settings:
  336                    for setting in duckdb_settings:
  337                        setting_value = duckdb_settings.get(setting)
  338                        if isinstance(setting_value, str):
  339                            setting_value = f"'{setting_value}'"
  340                        conn.execute(f"PRAGMA {setting}={setting_value};")
  341            elif connexion_format in ["sqlite"]:
  342                conn = sqlite3.connect(connexion_db)
  343
  344        # Set connexion
  345        self.conn = conn
  346
  347        # Log
  348        log.debug(f"connexion_format: {connexion_format}")
  349        log.debug(f"connexion_db: {connexion_db}")
  350        log.debug(f"connexion config: {connexion_config}")
  351        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  352
  353    def set_output(self, output: str = None) -> None:
  354        """
  355        The `set_output` function in Python sets the output file based on the input or a specified key
  356        in the config file, extracting the output name, extension, and format.
  357
  358        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  359        the output file. If the config file has an 'output' key, the method sets the output to the value
  360        of that key. If no output is provided, it sets the output to `None`
  361        :type output: str
  362        """
  363
  364        if output and not isinstance(output, str):
  365            self.output = output.name
  366        else:
  367            self.output = output
  368
  369        # Output format
  370        if self.output:
  371            output_name, output_extension = os.path.splitext(self.output)
  372            self.output_name = output_name
  373            self.output_extension = output_extension
  374            self.output_format = self.output_extension.replace(".", "")
  375        else:
  376            self.output_name = None
  377            self.output_extension = None
  378            self.output_format = None
  379
  380    def set_header(self) -> None:
  381        """
  382        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  383        """
  384
  385        input_file = self.get_input()
  386        default_header_list = [
  387            "##fileformat=VCFv4.2",
  388            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  389        ]
  390
  391        # Full path
  392        input_file = full_path(input_file)
  393
  394        if input_file:
  395
  396            input_format = self.get_input_format()
  397            input_compressed = self.get_input_compressed()
  398            config = self.get_config()
  399            header_list = default_header_list
  400            if input_format in [
  401                "vcf",
  402                "hdr",
  403                "tsv",
  404                "csv",
  405                "psv",
  406                "parquet",
  407                "db",
  408                "duckdb",
  409            ]:
  410                # header provided in param
  411                if config.get("header_file", None):
  412                    with open(config.get("header_file"), "rt") as f:
  413                        header_list = self.read_vcf_header(f)
  414                # within a vcf file format (header within input file itsself)
  415                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  416                    # within a compressed vcf file format (.vcf.gz)
  417                    if input_compressed:
  418                        with bgzf.open(input_file, "rt") as f:
  419                            header_list = self.read_vcf_header(f)
  420                    # within an uncompressed vcf file format (.vcf)
  421                    else:
  422                        with open(input_file, "rt") as f:
  423                            header_list = self.read_vcf_header(f)
  424                # header provided in default external file .hdr
  425                elif os.path.exists((input_file + ".hdr")):
  426                    with open(input_file + ".hdr", "rt") as f:
  427                        header_list = self.read_vcf_header(f)
  428                else:
  429                    try:  # Try to get header info fields and file columns
  430
  431                        with tempfile.TemporaryDirectory() as tmpdir:
  432
  433                            # Create database
  434                            db_for_header = Database(database=input_file)
  435
  436                            # Get header columns for infos fields
  437                            db_header_from_columns = (
  438                                db_for_header.get_header_from_columns()
  439                            )
  440
  441                            # Get real columns in the file
  442                            db_header_columns = db_for_header.get_columns()
  443
  444                            # Write header file
  445                            header_file_tmp = os.path.join(tmpdir, "header")
  446                            f = open(header_file_tmp, "w")
  447                            vcf.Writer(f, db_header_from_columns)
  448                            f.close()
  449
  450                            # Replace #CHROM line with rel columns
  451                            header_list = db_for_header.read_header_file(
  452                                header_file=header_file_tmp
  453                            )
  454                            header_list[-1] = "\t".join(db_header_columns)
  455
  456                    except:
  457
  458                        log.warning(
  459                            f"No header for file {input_file}. Set as default VCF header"
  460                        )
  461                        header_list = default_header_list
  462
  463            else:  # try for unknown format ?
  464
  465                log.error(f"Input file format '{input_format}' not available")
  466                raise ValueError(f"Input file format '{input_format}' not available")
  467
  468            if not header_list:
  469                header_list = default_header_list
  470
  471            # header as list
  472            self.header_list = header_list
  473
  474            # header as VCF object
  475            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  476
  477        else:
  478
  479            self.header_list = None
  480            self.header_vcf = None
  481
  482    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  483        """
  484        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  485        DataFrame based on the connection format.
  486
  487        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  488        represents the SQL query you want to execute. This query will be used to fetch data from a
  489        database and convert it into a pandas DataFrame
  490        :type query: str
  491        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  492        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  493        function will only fetch up to that number of rows from the database query result. If no limit
  494        is specified,
  495        :type limit: int
  496        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  497        """
  498
  499        # Connexion format
  500        connexion_format = self.get_connexion_format()
  501
  502        # Limit in query
  503        if limit:
  504            pd.set_option("display.max_rows", limit)
  505            if connexion_format in ["duckdb"]:
  506                df = (
  507                    self.conn.execute(query)
  508                    .fetch_record_batch(limit)
  509                    .read_next_batch()
  510                    .to_pandas()
  511                )
  512            elif connexion_format in ["sqlite"]:
  513                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  514
  515        # Full query
  516        else:
  517            if connexion_format in ["duckdb"]:
  518                df = self.conn.execute(query).df()
  519            elif connexion_format in ["sqlite"]:
  520                df = pd.read_sql_query(query, self.conn)
  521
  522        return df
  523
  524    def get_overview(self) -> None:
  525        """
  526        The function prints the input, output, config, and dataframe of the current object
  527        """
  528        table_variants_from = self.get_table_variants(clause="from")
  529        sql_columns = self.get_header_columns_as_sql()
  530        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  531        df = self.get_query_to_df(sql_query_export)
  532        log.info(
  533            "Input:  "
  534            + str(self.get_input())
  535            + " ["
  536            + str(str(self.get_input_format()))
  537            + "]"
  538        )
  539        log.info(
  540            "Output: "
  541            + str(self.get_output())
  542            + " ["
  543            + str(str(self.get_output_format()))
  544            + "]"
  545        )
  546        log.info("Config: ")
  547        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  548            "\n"
  549        ):
  550            log.info("\t" + str(d))
  551        log.info("Param: ")
  552        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  553            "\n"
  554        ):
  555            log.info("\t" + str(d))
  556        log.info("Sample list: " + str(self.get_header_sample_list()))
  557        log.info("Dataframe: ")
  558        for d in str(df).split("\n"):
  559            log.info("\t" + str(d))
  560
  561        # garbage collector
  562        del df
  563        gc.collect()
  564
  565        return None
  566
  567    def get_stats(self) -> dict:
  568        """
  569        The `get_stats` function calculates and returns various statistics of the current object,
  570        including information about the input file, variants, samples, header fields, quality, and
  571        SNVs/InDels.
  572        :return: a dictionary containing various statistics of the current object. The dictionary has
  573        the following structure:
  574        """
  575
  576        # Log
  577        log.info(f"Stats Calculation...")
  578
  579        # table varaints
  580        table_variants_from = self.get_table_variants()
  581
  582        # stats dict
  583        stats = {"Infos": {}}
  584
  585        ### File
  586        input_file = self.get_input()
  587        stats["Infos"]["Input file"] = input_file
  588
  589        # Header
  590        header_infos = self.get_header().infos
  591        header_formats = self.get_header().formats
  592        header_infos_list = list(header_infos)
  593        header_formats_list = list(header_formats)
  594
  595        ### Variants
  596
  597        stats["Variants"] = {}
  598
  599        # Variants by chr
  600        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  601        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  602        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  603            by=["CHROM"], kind="quicksort"
  604        )
  605
  606        # Total number of variants
  607        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  608
  609        # Calculate percentage
  610        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  611            lambda x: (x / nb_of_variants)
  612        )
  613
  614        stats["Variants"]["Number of variants by chromosome"] = (
  615            nb_of_variants_by_chrom.to_dict(orient="index")
  616        )
  617
  618        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  619
  620        ### Samples
  621
  622        # Init
  623        samples = {}
  624        nb_of_samples = 0
  625
  626        # Check Samples
  627        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  628            log.debug(f"Check samples...")
  629            for sample in self.get_header_sample_list():
  630                sql_query_samples = f"""
  631                    SELECT  '{sample}' as sample,
  632                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  633                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  634                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  635                    FROM {table_variants_from}
  636                    WHERE (
  637                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  638                        AND
  639                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  640                      )
  641                    GROUP BY genotype
  642                    """
  643                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  644                sample_genotype_count = sql_query_genotype_df["count"].sum()
  645                if len(sql_query_genotype_df):
  646                    nb_of_samples += 1
  647                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  648                        sql_query_genotype_df.to_dict(orient="index")
  649                    )
  650
  651            stats["Samples"] = samples
  652            stats["Infos"]["Number of samples"] = nb_of_samples
  653
  654        # #
  655        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  656        #     stats["Infos"]["Number of samples"] = nb_of_samples
  657        # elif nb_of_samples:
  658        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  659
  660        ### INFO and FORMAT fields
  661        header_types_df = {}
  662        header_types_list = {
  663            "List of INFO fields": header_infos,
  664            "List of FORMAT fields": header_formats,
  665        }
  666        i = 0
  667        for header_type in header_types_list:
  668
  669            header_type_infos = header_types_list.get(header_type)
  670            header_infos_dict = {}
  671
  672            for info in header_type_infos:
  673
  674                i += 1
  675                header_infos_dict[i] = {}
  676
  677                # ID
  678                header_infos_dict[i]["id"] = info
  679
  680                # num
  681                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  682                if header_type_infos[info].num in genotype_map.keys():
  683                    header_infos_dict[i]["Number"] = genotype_map.get(
  684                        header_type_infos[info].num
  685                    )
  686                else:
  687                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  688
  689                # type
  690                if header_type_infos[info].type:
  691                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  692                else:
  693                    header_infos_dict[i]["Type"] = "."
  694
  695                # desc
  696                if header_type_infos[info].desc != None:
  697                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  698                else:
  699                    header_infos_dict[i]["Description"] = ""
  700
  701            if len(header_infos_dict):
  702                header_types_df[header_type] = pd.DataFrame.from_dict(
  703                    header_infos_dict, orient="index"
  704                ).to_dict(orient="index")
  705
  706        # Stats
  707        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  708        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  709        stats["Header"] = header_types_df
  710
  711        ### QUAL
  712        if "QUAL" in self.get_header_columns():
  713            sql_query_qual = f"""
  714                    SELECT
  715                        avg(CAST(QUAL AS INTEGER)) AS Average,
  716                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  717                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  718                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  719                        median(CAST(QUAL AS INTEGER)) AS Median,
  720                        variance(CAST(QUAL AS INTEGER)) AS Variance
  721                    FROM {table_variants_from}
  722                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  723                    """
  724
  725            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  726            stats["Quality"] = {"Stats": qual}
  727
  728        ### SNV and InDel
  729
  730        sql_query_snv = f"""
  731            
  732            SELECT Type, count FROM (
  733
  734                    SELECT
  735                        'Total' AS Type,
  736                        count(*) AS count
  737                    FROM {table_variants_from}
  738
  739                    UNION
  740
  741                    SELECT
  742                        'MNV' AS Type,
  743                        count(*) AS count
  744                    FROM {table_variants_from}
  745                    WHERE len(REF) > 1 AND len(ALT) > 1
  746                    AND len(REF) = len(ALT)
  747
  748                    UNION
  749
  750                    SELECT
  751                        'InDel' AS Type,
  752                        count(*) AS count
  753                    FROM {table_variants_from}
  754                    WHERE len(REF) > 1 OR len(ALT) > 1
  755                    AND len(REF) != len(ALT)
  756                    
  757                    UNION
  758
  759                    SELECT
  760                        'SNV' AS Type,
  761                        count(*) AS count
  762                    FROM {table_variants_from}
  763                    WHERE len(REF) = 1 AND len(ALT) = 1
  764
  765                )
  766
  767            ORDER BY count DESC
  768
  769                """
  770        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  771
  772        sql_query_snv_substitution = f"""
  773                SELECT
  774                    concat(REF, '>', ALT) AS 'Substitution',
  775                    count(*) AS count
  776                FROM {table_variants_from}
  777                WHERE len(REF) = 1 AND len(ALT) = 1
  778                GROUP BY REF, ALT
  779                ORDER BY count(*) DESC
  780                """
  781        snv_substitution = (
  782            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  783        )
  784        stats["Variants"]["Counts"] = snv_indel
  785        stats["Variants"]["Substitutions"] = snv_substitution
  786
  787        return stats
  788
  789    def stats_to_file(self, file: str = None) -> str:
  790        """
  791        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  792        into a JSON object, and writes the JSON object to the specified file.
  793
  794        :param file: The `file` parameter is a string that represents the file path where the JSON data
  795        will be written
  796        :type file: str
  797        :return: the name of the file that was written to.
  798        """
  799
  800        # Get stats
  801        stats = self.get_stats()
  802
  803        # Serializing json
  804        json_object = json.dumps(stats, indent=4)
  805
  806        # Writing to sample.json
  807        with open(file, "w") as outfile:
  808            outfile.write(json_object)
  809
  810        return file
  811
  812    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  813        """
  814        The `print_stats` function generates a markdown file and prints the statistics contained in a
  815        JSON file in a formatted manner.
  816
  817        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  818        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  819        provided, a temporary directory will be created and the stats will be saved in a file named
  820        "stats.md" within that
  821        :type output_file: str
  822        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  823        file where the statistics will be saved. If no value is provided, a temporary directory will be
  824        created and a default file name "stats.json" will be used
  825        :type json_file: str
  826        :return: The function `print_stats` does not return any value. It has a return type annotation
  827        of `None`.
  828        """
  829
  830        # Full path
  831        output_file = full_path(output_file)
  832        json_file = full_path(json_file)
  833
  834        with tempfile.TemporaryDirectory() as tmpdir:
  835
  836            # Files
  837            if not output_file:
  838                output_file = os.path.join(tmpdir, "stats.md")
  839            if not json_file:
  840                json_file = os.path.join(tmpdir, "stats.json")
  841
  842            # Create folders
  843            if not os.path.exists(os.path.dirname(output_file)):
  844                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  845            if not os.path.exists(os.path.dirname(json_file)):
  846                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  847
  848            # Create stats JSON file
  849            stats_file = self.stats_to_file(file=json_file)
  850
  851            # Print stats file
  852            with open(stats_file) as f:
  853                stats = yaml.safe_load(f)
  854
  855            # Output
  856            output_title = []
  857            output_index = []
  858            output = []
  859
  860            # Title
  861            output_title.append("# HOWARD Stats")
  862
  863            # Index
  864            output_index.append("## Index")
  865
  866            # Process sections
  867            for section in stats:
  868                infos = stats.get(section)
  869                section_link = "#" + section.lower().replace(" ", "-")
  870                output.append(f"## {section}")
  871                output_index.append(f"- [{section}]({section_link})")
  872
  873                if len(infos):
  874                    for info in infos:
  875                        try:
  876                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  877                            is_df = True
  878                        except:
  879                            try:
  880                                df = pd.DataFrame.from_dict(
  881                                    json.loads((infos.get(info))), orient="index"
  882                                )
  883                                is_df = True
  884                            except:
  885                                is_df = False
  886                        if is_df:
  887                            output.append(f"### {info}")
  888                            info_link = "#" + info.lower().replace(" ", "-")
  889                            output_index.append(f"   - [{info}]({info_link})")
  890                            output.append(f"{df.to_markdown(index=False)}")
  891                        else:
  892                            output.append(f"- {info}: {infos.get(info)}")
  893                else:
  894                    output.append(f"NA")
  895
  896            # Write stats in markdown file
  897            with open(output_file, "w") as fp:
  898                for item in output_title:
  899                    fp.write("%s\n" % item)
  900                for item in output_index:
  901                    fp.write("%s\n" % item)
  902                for item in output:
  903                    fp.write("%s\n" % item)
  904
  905            # Output stats in markdown
  906            print("")
  907            print("\n\n".join(output_title))
  908            print("")
  909            print("\n\n".join(output))
  910            print("")
  911
  912        return None
  913
  914    def get_input(self) -> str:
  915        """
  916        It returns the value of the input variable.
  917        :return: The input is being returned.
  918        """
  919        return self.input
  920
  921    def get_input_format(self, input_file: str = None) -> str:
  922        """
  923        This function returns the format of the input variable, either from the provided input file or
  924        by prompting for input.
  925
  926        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  927        represents the file path of the input file. If no `input_file` is provided when calling the
  928        method, it will default to `None`
  929        :type input_file: str
  930        :return: The format of the input variable is being returned.
  931        """
  932
  933        if not input_file:
  934            input_file = self.get_input()
  935        input_format = get_file_format(input_file)
  936        return input_format
  937
  938    def get_input_compressed(self, input_file: str = None) -> str:
  939        """
  940        The function `get_input_compressed` returns the format of the input variable after compressing
  941        it.
  942
  943        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  944        that represents the file path of the input file. If no `input_file` is provided when calling the
  945        method, it will default to `None` and the method will then call `self.get_input()` to
  946        :type input_file: str
  947        :return: The function `get_input_compressed` returns the compressed format of the input
  948        variable.
  949        """
  950
  951        if not input_file:
  952            input_file = self.get_input()
  953        input_compressed = get_file_compressed(input_file)
  954        return input_compressed
  955
  956    def get_output(self) -> str:
  957        """
  958        It returns the output of the neuron.
  959        :return: The output of the neural network.
  960        """
  961
  962        return self.output
  963
  964    def get_output_format(self, output_file: str = None) -> str:
  965        """
  966        The function `get_output_format` returns the format of the input variable or the output file if
  967        provided.
  968
  969        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  970        that represents the file path of the output file. If no `output_file` is provided when calling
  971        the method, it will default to the output obtained from the `get_output` method of the class
  972        instance. The
  973        :type output_file: str
  974        :return: The format of the input variable is being returned.
  975        """
  976
  977        if not output_file:
  978            output_file = self.get_output()
  979        output_format = get_file_format(output_file)
  980
  981        return output_format
  982
  983    def get_config(self) -> dict:
  984        """
  985        It returns the config
  986        :return: The config variable is being returned.
  987        """
  988        return self.config
  989
  990    def get_param(self) -> dict:
  991        """
  992        It returns the param
  993        :return: The param variable is being returned.
  994        """
  995        return self.param
  996
  997    def get_connexion_db(self) -> str:
  998        """
  999        It returns the connexion_db attribute of the object
 1000        :return: The connexion_db is being returned.
 1001        """
 1002        return self.connexion_db
 1003
 1004    def get_prefix(self) -> str:
 1005        """
 1006        It returns the prefix of the object.
 1007        :return: The prefix is being returned.
 1008        """
 1009        return self.prefix
 1010
 1011    def get_table_variants(self, clause: str = "select") -> str:
 1012        """
 1013        This function returns the table_variants attribute of the object
 1014
 1015        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1016        defaults to select (optional)
 1017        :return: The table_variants attribute of the object.
 1018        """
 1019
 1020        # Access
 1021        access = self.get_config().get("access", None)
 1022
 1023        # Clauses "select", "where", "update"
 1024        if clause in ["select", "where", "update"]:
 1025            table_variants = self.table_variants
 1026        # Clause "from"
 1027        elif clause in ["from"]:
 1028            # For Read Only
 1029            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1030                input_file = self.get_input()
 1031                table_variants = f"'{input_file}' as variants"
 1032            # For Read Write
 1033            else:
 1034                table_variants = f"{self.table_variants} as variants"
 1035        else:
 1036            table_variants = self.table_variants
 1037        return table_variants
 1038
 1039    def get_tmp_dir(self) -> str:
 1040        """
 1041        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1042        parameters or a default path.
 1043        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1044        configuration, parameters, and a default value of "/tmp".
 1045        """
 1046
 1047        return get_tmp(
 1048            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1049        )
 1050
 1051    def get_connexion_type(self) -> str:
 1052        """
 1053        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1054
 1055        :return: The connexion type is being returned.
 1056        """
 1057        return self.get_config().get("connexion_type", "memory")
 1058
 1059    def get_connexion(self):
 1060        """
 1061        It returns the connection object
 1062
 1063        :return: The connection object.
 1064        """
 1065        return self.conn
 1066
 1067    def close_connexion(self) -> None:
 1068        """
 1069        This function closes the connection to the database.
 1070        :return: The connection is being closed.
 1071        """
 1072        return self.conn.close()
 1073
 1074    def get_header(self, type: str = "vcf"):
 1075        """
 1076        This function returns the header of the VCF file as a list of strings
 1077
 1078        :param type: the type of header you want to get, defaults to vcf (optional)
 1079        :return: The header of the vcf file.
 1080        """
 1081
 1082        if self.header_vcf:
 1083            if type == "vcf":
 1084                return self.header_vcf
 1085            elif type == "list":
 1086                return self.header_list
 1087        else:
 1088            if type == "vcf":
 1089                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1090                return header
 1091            elif type == "list":
 1092                return vcf_required
 1093
 1094    def get_header_length(self, file: str = None) -> int:
 1095        """
 1096        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1097        line.
 1098
 1099        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1100        header file. If this argument is provided, the function will read the header from the specified
 1101        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1102        :type file: str
 1103        :return: the length of the header list, excluding the #CHROM line.
 1104        """
 1105
 1106        if file:
 1107            return len(self.read_vcf_header_file(file=file)) - 1
 1108        elif self.get_header(type="list"):
 1109            return len(self.get_header(type="list")) - 1
 1110        else:
 1111            return 0
 1112
 1113    def get_header_columns(self) -> str:
 1114        """
 1115        This function returns the header list of a VCF
 1116
 1117        :return: The length of the header list.
 1118        """
 1119        if self.get_header():
 1120            return self.get_header(type="list")[-1]
 1121        else:
 1122            return ""
 1123
 1124    def get_header_columns_as_list(self) -> list:
 1125        """
 1126        This function returns the header list of a VCF
 1127
 1128        :return: The length of the header list.
 1129        """
 1130        if self.get_header():
 1131            return self.get_header_columns().strip().split("\t")
 1132        else:
 1133            return []
 1134
 1135    def get_header_columns_as_sql(self) -> str:
 1136        """
 1137        This function retruns header length (without #CHROM line)
 1138
 1139        :return: The length of the header list.
 1140        """
 1141        sql_column_list = []
 1142        for col in self.get_header_columns_as_list():
 1143            sql_column_list.append(f'"{col}"')
 1144        return ",".join(sql_column_list)
 1145
 1146    def get_header_sample_list(
 1147        self, check: bool = False, samples: list = None, samples_force: bool = False
 1148    ) -> list:
 1149        """
 1150        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1151        checking and filtering based on input parameters.
 1152
 1153        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1154        parameter that determines whether to check if the samples in the list are properly defined as
 1155        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1156        list is defined as a, defaults to False
 1157        :type check: bool (optional)
 1158        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1159        allows you to specify a subset of samples from the header. If you provide a list of sample
 1160        names, the function will check if each sample is defined in the header. If a sample is not found
 1161        in the
 1162        :type samples: list
 1163        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1164        a boolean parameter that determines whether to force the function to return the sample list
 1165        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1166        function will return the sample list without performing, defaults to False
 1167        :type samples_force: bool (optional)
 1168        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1169        parameters and conditions specified in the function.
 1170        """
 1171
 1172        # Init
 1173        samples_list = []
 1174
 1175        if samples is None:
 1176            samples_list = self.header_vcf.samples
 1177        else:
 1178            samples_checked = []
 1179            for sample in samples:
 1180                if sample in self.header_vcf.samples:
 1181                    samples_checked.append(sample)
 1182                else:
 1183                    log.warning(f"Sample '{sample}' not defined in header")
 1184            samples_list = samples_checked
 1185
 1186            # Force sample list without checking if is_genotype_column
 1187            if samples_force:
 1188                log.warning(f"Samples {samples_list} not checked if genotypes")
 1189                return samples_list
 1190
 1191        if check:
 1192            samples_checked = []
 1193            for sample in samples_list:
 1194                if self.is_genotype_column(column=sample):
 1195                    samples_checked.append(sample)
 1196                else:
 1197                    log.warning(
 1198                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1199                    )
 1200            samples_list = samples_checked
 1201
 1202        # Return samples list
 1203        return samples_list
 1204
 1205    def is_genotype_column(self, column: str = None) -> bool:
 1206        """
 1207        This function checks if a given column is a genotype column in a database.
 1208
 1209        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1210        represents the column name in a database table. This method checks if the specified column is a
 1211        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1212        method of
 1213        :type column: str
 1214        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1215        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1216        column name and returns the result. If the `column` parameter is None, it returns False.
 1217        """
 1218
 1219        if column is not None:
 1220            return Database(database=self.get_input()).is_genotype_column(column=column)
 1221        else:
 1222            return False
 1223
 1224    def get_verbose(self) -> bool:
 1225        """
 1226        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1227        exist
 1228
 1229        :return: The value of the key "verbose" in the config dictionary.
 1230        """
 1231        return self.get_config().get("verbose", False)
 1232
 1233    def get_connexion_format(self) -> str:
 1234        """
 1235        It returns the connexion format of the object.
 1236        :return: The connexion_format is being returned.
 1237        """
 1238        connexion_format = self.connexion_format
 1239        if connexion_format not in ["duckdb", "sqlite"]:
 1240            log.error(f"Unknown connexion format {connexion_format}")
 1241            raise ValueError(f"Unknown connexion format {connexion_format}")
 1242        else:
 1243            return connexion_format
 1244
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current connexion (duckdb or sqlite).

        :param file: the file (path or file-like object) to load into the table
        :param columns: comma-separated column names used for the INSERT
        statement (duckdb path only)
        :type columns: str
        :param header_len: number of leading lines to skip before the data
        (e.g. the VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator of the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
        the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config
        # The "load.chunk" config entry overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize is falsy (0/None), nothing is inserted
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves "chunk" in the SQL text to the local
                    # pandas DataFrame (replacement scan) — the variable name
                    # must stay exactly "chunk" for this query to work
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite: pandas handles the append directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1298
 1299    def load_data(
 1300        self,
 1301        input_file: str = None,
 1302        drop_variants_table: bool = False,
 1303        sample_size: int = 20480,
 1304    ) -> None:
 1305        """
 1306        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1307        table before loading the data and specify a sample size.
 1308
 1309        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1310        table
 1311        :type input_file: str
 1312        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1313        determines whether the variants table should be dropped before loading the data. If set to
 1314        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1315        not be dropped, defaults to False
 1316        :type drop_variants_table: bool (optional)
 1317        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1318        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1319        20480
 1320        :type sample_size: int (optional)
 1321        """
 1322
 1323        log.info("Loading...")
 1324
 1325        # change input file
 1326        if input_file:
 1327            self.set_input(input_file)
 1328            self.set_header()
 1329
 1330        # drop variants table
 1331        if drop_variants_table:
 1332            self.drop_variants_table()
 1333
 1334        # get table variants
 1335        table_variants = self.get_table_variants()
 1336
 1337        # Access
 1338        access = self.get_config().get("access", None)
 1339        log.debug(f"access: {access}")
 1340
 1341        # Input format and compress
 1342        input_format = self.get_input_format()
 1343        input_compressed = self.get_input_compressed()
 1344        log.debug(f"input_format: {input_format}")
 1345        log.debug(f"input_compressed: {input_compressed}")
 1346
 1347        # input_compressed_format
 1348        if input_compressed:
 1349            input_compressed_format = "gzip"
 1350        else:
 1351            input_compressed_format = "none"
 1352        log.debug(f"input_compressed_format: {input_compressed_format}")
 1353
 1354        # Connexion format
 1355        connexion_format = self.get_connexion_format()
 1356
 1357        # Sample size
 1358        if not sample_size:
 1359            sample_size = -1
 1360        log.debug(f"sample_size: {sample_size}")
 1361
 1362        # Load data
 1363        log.debug(f"Load Data from {input_format}")
 1364
 1365        # DuckDB connexion
 1366        if connexion_format in ["duckdb"]:
 1367
 1368            # Database already exists
 1369            if self.input_format in ["db", "duckdb"]:
 1370
 1371                if connexion_format in ["duckdb"]:
 1372                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1373                else:
 1374                    log.error(
 1375                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1376                    )
 1377                    raise ValueError(
 1378                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1379                    )
 1380
 1381            # Load from existing database format
 1382            else:
 1383
 1384                try:
 1385                    # Create Table or View
 1386                    database = Database(database=self.input)
 1387                    sql_from = database.get_sql_from(sample_size=sample_size)
 1388
 1389                    if access in ["RO"]:
 1390                        sql_load = (
 1391                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1392                        )
 1393                    else:
 1394                        sql_load = (
 1395                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1396                        )
 1397                    self.conn.execute(sql_load)
 1398
 1399                except:
 1400                    # Format not available
 1401                    log.error(f"Input file format '{self.input_format}' not available")
 1402                    raise ValueError(
 1403                        f"Input file format '{self.input_format}' not available"
 1404                    )
 1405
 1406        # SQLite connexion
 1407        elif connexion_format in ["sqlite"] and input_format in [
 1408            "vcf",
 1409            "tsv",
 1410            "csv",
 1411            "psv",
 1412        ]:
 1413
 1414            # Main structure
 1415            structure = {
 1416                "#CHROM": "VARCHAR",
 1417                "POS": "INTEGER",
 1418                "ID": "VARCHAR",
 1419                "REF": "VARCHAR",
 1420                "ALT": "VARCHAR",
 1421                "QUAL": "VARCHAR",
 1422                "FILTER": "VARCHAR",
 1423                "INFO": "VARCHAR",
 1424            }
 1425
 1426            # Strcuture with samples
 1427            structure_complete = structure
 1428            if self.get_header_sample_list():
 1429                structure["FORMAT"] = "VARCHAR"
 1430                for sample in self.get_header_sample_list():
 1431                    structure_complete[sample] = "VARCHAR"
 1432
 1433            # Columns list for create and insert
 1434            sql_create_table_columns = []
 1435            sql_create_table_columns_list = []
 1436            for column in structure_complete:
 1437                column_type = structure_complete[column]
 1438                sql_create_table_columns.append(
 1439                    f'"{column}" {column_type} default NULL'
 1440                )
 1441                sql_create_table_columns_list.append(f'"{column}"')
 1442
 1443            # Create database
 1444            log.debug(f"Create Table {table_variants}")
 1445            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1446            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1447            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1448            self.conn.execute(sql_create_table)
 1449
 1450            # chunksize define length of file chunk load file
 1451            chunksize = 100000
 1452
 1453            # delimiter
 1454            delimiter = file_format_delimiters.get(input_format, "\t")
 1455
 1456            # Load the input file
 1457            with open(self.input, "rt") as input_file:
 1458
 1459                # Use the appropriate file handler based on the input format
 1460                if input_compressed:
 1461                    input_file = bgzf.open(self.input, "rt")
 1462                if input_format in ["vcf"]:
 1463                    header_len = self.get_header_length()
 1464                else:
 1465                    header_len = 0
 1466
 1467                # Insert the file contents into a table
 1468                self.insert_file_to_table(
 1469                    input_file,
 1470                    columns=sql_create_table_columns_list_sql,
 1471                    header_len=header_len,
 1472                    sep=delimiter,
 1473                    chunksize=chunksize,
 1474                )
 1475
 1476        else:
 1477            log.error(
 1478                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1479            )
 1480            raise ValueError(
 1481                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1482            )
 1483
 1484        # Explode INFOS fields into table fields
 1485        if self.get_explode_infos():
 1486            self.explode_infos(
 1487                prefix=self.get_explode_infos_prefix(),
 1488                fields=self.get_explode_infos_fields(),
 1489                force=True,
 1490            )
 1491
 1492        # Create index after insertion
 1493        self.create_indexes()
 1494
 1495    def get_explode_infos(self) -> bool:
 1496        """
 1497        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1498        to False if it is not set.
 1499        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1500        value. If the parameter is not present, it will return False.
 1501        """
 1502
 1503        return self.get_param().get("explode", {}).get("explode_infos", False)
 1504
 1505    def get_explode_infos_fields(
 1506        self,
 1507        explode_infos_fields: str = None,
 1508        remove_fields_not_in_header: bool = False,
 1509    ) -> list:
 1510        """
 1511        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1512        the input parameter `explode_infos_fields`.
 1513
 1514        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1515        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1516        comma-separated list of field names to explode
 1517        :type explode_infos_fields: str
 1518        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1519        flag that determines whether to remove fields that are not present in the header. If it is set
 1520        to `True`, any field that is not in the header will be excluded from the list of exploded
 1521        information fields. If it is set to `, defaults to False
 1522        :type remove_fields_not_in_header: bool (optional)
 1523        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1524        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1525        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1526        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1527        splitting the string by commas.
 1528        """
 1529
 1530        # If no fields, get it in param
 1531        if not explode_infos_fields:
 1532            explode_infos_fields = (
 1533                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1534            )
 1535
 1536        # If no fields, defined as all fields in header using keyword
 1537        if not explode_infos_fields:
 1538            explode_infos_fields = "*"
 1539
 1540        # If fields list not empty
 1541        if explode_infos_fields:
 1542
 1543            # Input fields list
 1544            if isinstance(explode_infos_fields, str):
 1545                fields_input = explode_infos_fields.split(",")
 1546            elif isinstance(explode_infos_fields, list):
 1547                fields_input = explode_infos_fields
 1548            else:
 1549                fields_input = []
 1550
 1551            # Fields list without * keyword
 1552            fields_without_all = fields_input.copy()
 1553            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1554                fields_without_all.remove("*")
 1555
 1556            # Fields in header
 1557            fields_in_header = sorted(list(set(self.get_header().infos)))
 1558
 1559            # Construct list of fields
 1560            fields_output = []
 1561            for field in fields_input:
 1562
 1563                # Strip field
 1564                field = field.strip()
 1565
 1566                # format keyword * in regex
 1567                if field.upper() in ["*"]:
 1568                    field = ".*"
 1569
 1570                # Find all fields with pattern
 1571                r = re.compile(field)
 1572                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1573
 1574                # Remove fields input from search
 1575                if field in fields_search:
 1576                    fields_search = [field]
 1577                elif fields_search != [field]:
 1578                    fields_search = sorted(
 1579                        list(set(fields_search).difference(fields_input))
 1580                    )
 1581
 1582                # If field is not in header (avoid not well formatted header)
 1583                if not fields_search and not remove_fields_not_in_header:
 1584                    fields_search = [field]
 1585
 1586                # Add found fields
 1587                for new_field in fields_search:
 1588                    # Add field, if not already exists, and if it is in header (if asked)
 1589                    if (
 1590                        new_field not in fields_output
 1591                        and (
 1592                            not remove_fields_not_in_header
 1593                            or new_field in fields_in_header
 1594                        )
 1595                        and new_field not in [".*"]
 1596                    ):
 1597                        fields_output.append(new_field)
 1598
 1599            return fields_output
 1600
 1601        else:
 1602
 1603            return []
 1604
 1605    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1606        """
 1607        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1608        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1609        not provided.
 1610
 1611        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1612        prefix to be used for exploding or expanding information
 1613        :type explode_infos_prefix: str
 1614        :return: the value of the variable `explode_infos_prefix`.
 1615        """
 1616
 1617        if not explode_infos_prefix:
 1618            explode_infos_prefix = (
 1619                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1620            )
 1621
 1622        return explode_infos_prefix
 1623
 1624    def add_column(
 1625        self,
 1626        table_name,
 1627        column_name,
 1628        column_type,
 1629        default_value=None,
 1630        drop: bool = False,
 1631    ) -> dict:
 1632        """
 1633        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1634        doesn't already exist.
 1635
 1636        :param table_name: The name of the table to which you want to add a column
 1637        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1638        to the table
 1639        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1640        want to add to the table. It should be a string that represents the desired data type, such as
 1641        "INTEGER", "TEXT", "REAL", etc
 1642        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1643        default value for the newly added column. If a default value is provided, it will be assigned to
 1644        the column for any existing rows that do not have a value for that column
 1645        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1646        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1647        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1648        to False
 1649        :type drop: bool (optional)
 1650        :return: a boolean value indicating whether the column was successfully added to the table.
 1651        """
 1652
 1653        # added
 1654        added = False
 1655        dropped = False
 1656
 1657        # Check if the column already exists in the table
 1658        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1659        columns = self.get_query_to_df(query).columns.tolist()
 1660        if column_name.upper() in [c.upper() for c in columns]:
 1661            log.debug(
 1662                f"The {column_name} column already exists in the {table_name} table"
 1663            )
 1664            if drop:
 1665                self.drop_column(table_name=table_name, column_name=column_name)
 1666                dropped = True
 1667            else:
 1668                return None
 1669        else:
 1670            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1671
 1672        # Add column in table
 1673        add_column_query = (
 1674            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1675        )
 1676        if default_value is not None:
 1677            add_column_query += f" DEFAULT {default_value}"
 1678        self.execute_query(add_column_query)
 1679        added = not dropped
 1680        log.debug(
 1681            f"The {column_name} column was successfully added to the {table_name} table"
 1682        )
 1683
 1684        if added:
 1685            added_column = {
 1686                "table_name": table_name,
 1687                "column_name": column_name,
 1688                "column_type": column_type,
 1689                "default_value": default_value,
 1690            }
 1691        else:
 1692            added_column = None
 1693
 1694        return added_column
 1695
 1696    def drop_column(
 1697        self, column: dict = None, table_name: str = None, column_name: str = None
 1698    ) -> bool:
 1699        """
 1700        The `drop_column` function drops a specified column from a given table in a database and returns
 1701        True if the column was successfully dropped, and False if the column does not exist in the
 1702        table.
 1703
 1704        :param column: The `column` parameter is a dictionary that contains information about the column
 1705        you want to drop. It has two keys:
 1706        :type column: dict
 1707        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1708        drop a column
 1709        :type table_name: str
 1710        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1711        from the table
 1712        :type column_name: str
 1713        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1714        and False if the column does not exist in the table.
 1715        """
 1716
 1717        # Find column infos
 1718        if column:
 1719            if isinstance(column, dict):
 1720                table_name = column.get("table_name", None)
 1721                column_name = column.get("column_name", None)
 1722            elif isinstance(column, str):
 1723                table_name = self.get_table_variants()
 1724                column_name = column
 1725            else:
 1726                table_name = None
 1727                column_name = None
 1728
 1729        if not table_name and not column_name:
 1730            return False
 1731
 1732        # Removed
 1733        removed = False
 1734
 1735        # Check if the column already exists in the table
 1736        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1737        columns = self.get_query_to_df(query).columns.tolist()
 1738        if column_name in columns:
 1739            log.debug(f"The {column_name} column exists in the {table_name} table")
 1740        else:
 1741            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1742            return False
 1743
 1744        # Add column in table # ALTER TABLE integers DROP k
 1745        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1746        self.execute_query(add_column_query)
 1747        removed = True
 1748        log.debug(
 1749            f"The {column_name} column was successfully dropped to the {table_name} table"
 1750        )
 1751
 1752        return removed
 1753
 1754    def explode_infos(
 1755        self,
 1756        prefix: str = None,
 1757        create_index: bool = False,
 1758        fields: list = None,
 1759        force: bool = False,
 1760        proccess_all_fields_together: bool = False,
 1761        table: str = None,
 1762    ) -> list:
 1763        """
 1764        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1765        individual columns, returning a list of added columns.
 1766
 1767        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1768        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1769        `self.get_explode_infos_prefix()` as the prefix
 1770        :type prefix: str
 1771        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1772        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1773        `False`, indexes will not be created. The default value is `False`, defaults to False
 1774        :type create_index: bool (optional)
 1775        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1776        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1777        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1778        a list to the `
 1779        :type fields: list
 1780        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1781        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1782        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1783        defaults to False
 1784        :type force: bool (optional)
 1785        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1786        flag that determines whether to process all the INFO fields together or individually. If set to
 1787        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1788        be processed individually. The default value is, defaults to False
 1789        :type proccess_all_fields_together: bool (optional)
 1790        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1791        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1792        a value for the `table` parameter, the function will use that table name. If the `table`
 1793        parameter is
 1794        :type table: str
 1795        :return: The `explode_infos` function returns a list of added columns.
 1796        """
 1797
 1798        # drop indexes
 1799        self.drop_indexes()
 1800
 1801        # connexion format
 1802        connexion_format = self.get_connexion_format()
 1803
 1804        # Access
 1805        access = self.get_config().get("access", None)
 1806
 1807        # Added columns
 1808        added_columns = []
 1809
 1810        if access not in ["RO"]:
 1811
 1812            # prefix
 1813            if prefix in [None, True] or not isinstance(prefix, str):
 1814                if self.get_explode_infos_prefix() not in [None, True]:
 1815                    prefix = self.get_explode_infos_prefix()
 1816                else:
 1817                    prefix = "INFO/"
 1818
 1819            # table variants
 1820            if table is not None:
 1821                table_variants = table
 1822            else:
 1823                table_variants = self.get_table_variants(clause="select")
 1824
 1825            # extra infos
 1826            try:
 1827                extra_infos = self.get_extra_infos()
 1828            except:
 1829                extra_infos = []
 1830
 1831            # Header infos
 1832            header_infos = self.get_header().infos
 1833
 1834            log.debug(
 1835                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1836            )
 1837
 1838            sql_info_alter_table_array = []
 1839
 1840            # Info fields to check
 1841            fields_list = list(header_infos)
 1842            if fields:
 1843                fields_list += fields
 1844            fields_list = set(fields_list)
 1845
 1846            # If no fields
 1847            if not fields:
 1848                fields = []
 1849
 1850            # Translate fields if patterns
 1851            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1852
 1853            for info in fields:
 1854
 1855                info_id_sql = prefix + info
 1856
 1857                if (
 1858                    info in fields_list
 1859                    or prefix + info in fields_list
 1860                    or info in extra_infos
 1861                ):
 1862
 1863                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1864
 1865                    if info in header_infos:
 1866                        info_type = header_infos[info].type
 1867                        info_num = header_infos[info].num
 1868                    else:
 1869                        info_type = "String"
 1870                        info_num = 0
 1871
 1872                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1873                    if info_num != 1:
 1874                        type_sql = "VARCHAR"
 1875
 1876                    # Add field
 1877                    added_column = self.add_column(
 1878                        table_name=table_variants,
 1879                        column_name=info_id_sql,
 1880                        column_type=type_sql,
 1881                        default_value="null",
 1882                        drop=force,
 1883                    )
 1884
 1885                    if added_column:
 1886                        added_columns.append(added_column)
 1887
 1888                    if added_column or force:
 1889
 1890                        # add field to index
 1891                        self.index_additionnal_fields.append(info_id_sql)
 1892
 1893                        # Update field array
 1894                        if connexion_format in ["duckdb"]:
 1895                            update_info_field = f"""
 1896                            "{info_id_sql}" =
 1897                                CASE
 1898                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1899                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1900                                END
 1901                            """
 1902                        elif connexion_format in ["sqlite"]:
 1903                            update_info_field = f"""
 1904                                "{info_id_sql}" =
 1905                                    CASE
 1906                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1907                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1908                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1909                                    END
 1910                            """
 1911
 1912                        sql_info_alter_table_array.append(update_info_field)
 1913
 1914            if sql_info_alter_table_array:
 1915
 1916                # By chromosomes
 1917                try:
 1918                    chromosomes_list = list(
 1919                        self.get_query_to_df(
 1920                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1921                        )["#CHROM"]
 1922                    )
 1923                except:
 1924                    chromosomes_list = [None]
 1925
 1926                for chrom in chromosomes_list:
 1927                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1928
 1929                    # Where clause
 1930                    where_clause = ""
 1931                    if chrom and len(chromosomes_list) > 1:
 1932                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1933
 1934                    # Update table
 1935                    if proccess_all_fields_together:
 1936                        sql_info_alter_table_array_join = ", ".join(
 1937                            sql_info_alter_table_array
 1938                        )
 1939                        if sql_info_alter_table_array_join:
 1940                            sql_info_alter_table = f"""
 1941                                UPDATE {table_variants}
 1942                                SET {sql_info_alter_table_array_join}
 1943                                {where_clause}
 1944                                """
 1945                            log.debug(
 1946                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1947                            )
 1948                            # log.debug(sql_info_alter_table)
 1949                            self.conn.execute(sql_info_alter_table)
 1950                    else:
 1951                        sql_info_alter_num = 0
 1952                        for sql_info_alter in sql_info_alter_table_array:
 1953                            sql_info_alter_num += 1
 1954                            sql_info_alter_table = f"""
 1955                                UPDATE {table_variants}
 1956                                SET {sql_info_alter}
 1957                                {where_clause}
 1958                                """
 1959                            log.debug(
 1960                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1961                            )
 1962                            # log.debug(sql_info_alter_table)
 1963                            self.conn.execute(sql_info_alter_table)
 1964
 1965        # create indexes
 1966        if create_index:
 1967            self.create_indexes()
 1968
 1969        return added_columns
 1970
 1971    def create_indexes(self) -> None:
 1972        """
 1973        Create indexes on the table after insertion
 1974        """
 1975
 1976        # Access
 1977        access = self.get_config().get("access", None)
 1978
 1979        # get table variants
 1980        table_variants = self.get_table_variants("FROM")
 1981
 1982        if self.get_indexing() and access not in ["RO"]:
 1983            # Create index
 1984            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1985            self.conn.execute(sql_create_table_index)
 1986            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1987            self.conn.execute(sql_create_table_index)
 1988            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1989            self.conn.execute(sql_create_table_index)
 1990            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1991            self.conn.execute(sql_create_table_index)
 1992            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1993            self.conn.execute(sql_create_table_index)
 1994            for field in self.index_additionnal_fields:
 1995                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1996                self.conn.execute(sql_create_table_index)
 1997
 1998    def drop_indexes(self) -> None:
 1999        """
 2000        Create indexes on the table after insertion
 2001        """
 2002
 2003        # Access
 2004        access = self.get_config().get("access", None)
 2005
 2006        # get table variants
 2007        table_variants = self.get_table_variants("FROM")
 2008
 2009        # Get database format
 2010        connexion_format = self.get_connexion_format()
 2011
 2012        if access not in ["RO"]:
 2013            if connexion_format in ["duckdb"]:
 2014                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2015            elif connexion_format in ["sqlite"]:
 2016                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2017
 2018            list_indexes = self.conn.execute(sql_list_indexes)
 2019            index_names = [row[0] for row in list_indexes.fetchall()]
 2020            for index in index_names:
 2021                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2022                self.conn.execute(sql_drop_table_index)
 2023
 2024    def read_vcf_header(self, f) -> list:
 2025        """
 2026        It reads the header of a VCF file and returns a list of the header lines
 2027
 2028        :param f: the file object
 2029        :return: The header lines of the VCF file.
 2030        """
 2031
 2032        header_list = []
 2033        for line in f:
 2034            header_list.append(line)
 2035            if line.startswith("#CHROM"):
 2036                break
 2037        return header_list
 2038
 2039    def read_vcf_header_file(self, file: str = None) -> list:
 2040        """
 2041        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2042        uncompressed files.
 2043
 2044        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2045        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2046        default to `None`
 2047        :type file: str
 2048        :return: The function `read_vcf_header_file` returns a list.
 2049        """
 2050
 2051        if self.get_input_compressed(input_file=file):
 2052            with bgzf.open(file, "rt") as f:
 2053                return self.read_vcf_header(f=f)
 2054        else:
 2055            with open(file, "rt") as f:
 2056                return self.read_vcf_header(f=f)
 2057
 2058    def execute_query(self, query: str):
 2059        """
 2060        It takes a query as an argument, executes it, and returns the results
 2061
 2062        :param query: The query to be executed
 2063        :return: The result of the query is being returned.
 2064        """
 2065        if query:
 2066            return self.conn.execute(query)  # .fetchall()
 2067        else:
 2068            return None
 2069
 2070    def export_output(
 2071        self,
 2072        output_file: str | None = None,
 2073        output_header: str | None = None,
 2074        export_header: bool = True,
 2075        query: str | None = None,
 2076        parquet_partitions: list | None = None,
 2077        chunk_size: int | None = None,
 2078        threads: int | None = None,
 2079        sort: bool = False,
 2080        index: bool = False,
 2081        order_by: str | None = None,
 2082    ) -> bool:
 2083        """
 2084        The `export_output` function exports data from a VCF file to a specified output file in various
 2085        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2086
 2087        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2088        output file to be generated by the function. This is where the exported data will be saved
 2089        :type output_file: str
 2090        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2091        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2092        header will be exported to a file with the same name as the `output_file` parameter, but with
 2093        the extension "
 2094        :type output_header: str
 2095        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2096        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2097        True, the header will be exported to a file. If `export_header` is False, the header will not
 2098        be, defaults to True, if output format is not VCF
 2099        :type export_header: bool (optional)
 2100        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2101        select specific data from the VCF file before exporting it. If provided, only the data that
 2102        matches the query will be exported
 2103        :type query: str
 2104        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2105        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2106        organize data in a hierarchical directory structure based on the values of one or more columns.
 2107        This can improve query performance when working with large datasets
 2108        :type parquet_partitions: list
 2109        :param chunk_size: The `chunk_size` parameter specifies the number of
 2110        records in batch when exporting data in Parquet format. This parameter is used for
 2111        partitioning the Parquet file into multiple files.
 2112        :type chunk_size: int
 2113        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2114        threads to be used during the export process. It determines the level of parallelism and can
 2115        improve the performance of the export operation. If not provided, the function will use the
 2116        default number of threads
 2117        :type threads: int
 2118        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2119        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2120        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2121        False
 2122        :type sort: bool (optional)
 2123        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2124        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2125        no index will be created. The default value is False, defaults to False
 2126        :type index: bool (optional)
 2127        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2128        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2129        :type order_by: str
 2130        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2131        None if it doesn't.
 2132        """
 2133
 2134        # Log
 2135        log.info("Exporting...")
 2136
 2137        # Full path
 2138        output_file = full_path(output_file)
 2139        output_header = full_path(output_header)
 2140
 2141        # Config
 2142        config = self.get_config()
 2143
 2144        # Param
 2145        param = self.get_param()
 2146
 2147        # Tmp files to remove
 2148        tmp_to_remove = []
 2149
 2150        # If no output, get it
 2151        if not output_file:
 2152            output_file = self.get_output()
 2153
 2154        # If not threads
 2155        if not threads:
 2156            threads = self.get_threads()
 2157
 2158        # Auto header name with extension
 2159        if export_header or output_header:
 2160            if not output_header:
 2161                output_header = f"{output_file}.hdr"
 2162            # Export header
 2163            self.export_header(output_file=output_file)
 2164
 2165        # Switch off export header if VCF output
 2166        output_file_type = get_file_format(output_file)
 2167        if output_file_type in ["vcf"]:
 2168            export_header = False
 2169            tmp_to_remove.append(output_header)
 2170
 2171        # Chunk size
 2172        if not chunk_size:
 2173            chunk_size = config.get("chunk_size", None)
 2174
 2175        # Parquet partition
 2176        if not parquet_partitions:
 2177            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2178        if parquet_partitions and isinstance(parquet_partitions, str):
 2179            parquet_partitions = parquet_partitions.split(",")
 2180
 2181        # Order by
 2182        if not order_by:
 2183            order_by = param.get("export", {}).get("order_by", "")
 2184
 2185        # Header in output
 2186        header_in_output = param.get("export", {}).get("include_header", False)
 2187
 2188        # Database
 2189        database_source = self.get_connexion()
 2190
 2191        # Connexion format
 2192        connexion_format = self.get_connexion_format()
 2193
 2194        # Explode infos
 2195        if self.get_explode_infos():
 2196            self.explode_infos(
 2197                prefix=self.get_explode_infos_prefix(),
 2198                fields=self.get_explode_infos_fields(),
 2199                force=False,
 2200            )
 2201
 2202        # if connexion_format in ["sqlite"] or query:
 2203        if connexion_format in ["sqlite"]:
 2204
 2205            # Export in Parquet
 2206            random_tmp = "".join(
 2207                random.choice(string.ascii_lowercase) for i in range(10)
 2208            )
 2209            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2210            tmp_to_remove.append(database_source)
 2211
 2212            # Table Variants
 2213            table_variants = self.get_table_variants()
 2214
 2215            # Create export query
 2216            sql_query_export_subquery = f"""
 2217                SELECT * FROM {table_variants}
 2218                """
 2219
 2220            # Write source file
 2221            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2222
 2223        # Create database
 2224        database = Database(
 2225            database=database_source,
 2226            table="variants",
 2227            header_file=output_header,
 2228            conn_config=self.get_connexion_config(),
 2229        )
 2230
 2231        # Existing colomns header
 2232        existing_columns_header = database.get_header_columns_from_database()
 2233
 2234        # Sample list
 2235        if output_file_type in ["vcf"]:
 2236            get_samples = self.get_samples()
 2237            get_samples_check = self.get_samples_check()
 2238            samples_force = get_samples is not None
 2239            sample_list = self.get_header_sample_list(
 2240                check=get_samples_check,
 2241                samples=get_samples,
 2242                samples_force=samples_force,
 2243            )
 2244        else:
 2245            sample_list = None
 2246
 2247        # Export file
 2248        database.export(
 2249            output_database=output_file,
 2250            output_header=output_header,
 2251            existing_columns_header=existing_columns_header,
 2252            parquet_partitions=parquet_partitions,
 2253            chunk_size=chunk_size,
 2254            threads=threads,
 2255            sort=sort,
 2256            index=index,
 2257            header_in_output=header_in_output,
 2258            order_by=order_by,
 2259            query=query,
 2260            export_header=export_header,
 2261            sample_list=sample_list,
 2262        )
 2263
 2264        # Remove
 2265        remove_if_exists(tmp_to_remove)
 2266
 2267        return (os.path.exists(output_file) or None) and (
 2268            os.path.exists(output_file) or None
 2269        )
 2270
 2271    def get_extra_infos(self, table: str = None) -> list:
 2272        """
 2273        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2274        in the header.
 2275
 2276        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2277        name of the table from which you want to retrieve the extra columns that are not present in the
 2278        header. If the `table` parameter is not provided when calling the function, it will default to
 2279        using the variants
 2280        :type table: str
 2281        :return: A list of columns that are in the specified table but not in the header of the table.
 2282        """
 2283
 2284        header_columns = []
 2285
 2286        if not table:
 2287            table = self.get_table_variants(clause="from")
 2288            header_columns = self.get_header_columns()
 2289
 2290        # Check all columns in the database
 2291        query = f""" SELECT * FROM {table} LIMIT 1 """
 2292        log.debug(f"query {query}")
 2293        table_columns = self.get_query_to_df(query).columns.tolist()
 2294        extra_columns = []
 2295
 2296        # Construct extra infos (not in header)
 2297        for column in table_columns:
 2298            if column not in header_columns:
 2299                extra_columns.append(column)
 2300
 2301        return extra_columns
 2302
 2303    def get_extra_infos_sql(self, table: str = None) -> str:
 2304        """
 2305        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2306        by double quotes
 2307
 2308        :param table: The name of the table to get the extra infos from. If None, the default table is
 2309        used
 2310        :type table: str
 2311        :return: A string of the extra infos
 2312        """
 2313
 2314        return ", ".join(
 2315            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2316        )
 2317
 2318    def export_header(
 2319        self,
 2320        header_name: str = None,
 2321        output_file: str = None,
 2322        output_file_ext: str = ".hdr",
 2323        clean_header: bool = True,
 2324        remove_chrom_line: bool = False,
 2325    ) -> str:
 2326        """
 2327        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2328        specified options, and writes it to a new file.
 2329
 2330        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2331        this parameter is not specified, the header will be written to the output file
 2332        :type header_name: str
 2333        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2334        specify the name of the output file where the header will be written. If this parameter is not
 2335        provided, the header will be written to a temporary file
 2336        :type output_file: str
 2337        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2338        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2339        if not specified by the user. This extension will be appended to the `output_file` name to
 2340        create the final, defaults to .hdr
 2341        :type output_file_ext: str (optional)
 2342        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2343        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2344        `True`, the function will clean the header by modifying certain lines based on a specific
 2345        pattern. If `clean_header`, defaults to True
 2346        :type clean_header: bool (optional)
 2347        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2348        boolean flag that determines whether the #CHROM line should be removed from the header before
 2349        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2350        defaults to False
 2351        :type remove_chrom_line: bool (optional)
 2352        :return: The function `export_header` returns the name of the temporary header file that is
 2353        created.
 2354        """
 2355
 2356        if not header_name and not output_file:
 2357            output_file = self.get_output()
 2358
 2359        if self.get_header():
 2360
 2361            # Get header object
 2362            header_obj = self.get_header()
 2363
 2364            # Create database
 2365            db_for_header = Database(database=self.get_input())
 2366
 2367            # Get real columns in the file
 2368            db_header_columns = db_for_header.get_columns()
 2369
 2370            with tempfile.TemporaryDirectory() as tmpdir:
 2371
 2372                # Write header file
 2373                header_file_tmp = os.path.join(tmpdir, "header")
 2374                f = open(header_file_tmp, "w")
 2375                vcf.Writer(f, header_obj)
 2376                f.close()
 2377
 2378                # Replace #CHROM line with rel columns
 2379                header_list = db_for_header.read_header_file(
 2380                    header_file=header_file_tmp
 2381                )
 2382                header_list[-1] = "\t".join(db_header_columns)
 2383
 2384                # Remove CHROM line
 2385                if remove_chrom_line:
 2386                    header_list.pop()
 2387
 2388                # Clean header
 2389                if clean_header:
 2390                    header_list_clean = []
 2391                    for head in header_list:
 2392                        # Clean head for malformed header
 2393                        head_clean = head
 2394                        head_clean = re.subn(
 2395                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2396                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2397                            head_clean,
 2398                            2,
 2399                        )[0]
 2400                        # Write header
 2401                        header_list_clean.append(head_clean)
 2402                    header_list = header_list_clean
 2403
 2404            tmp_header_name = output_file + output_file_ext
 2405
 2406            f = open(tmp_header_name, "w")
 2407            for line in header_list:
 2408                f.write(line)
 2409            f.close()
 2410
 2411        return tmp_header_name
 2412
 2413    def export_variant_vcf(
 2414        self,
 2415        vcf_file,
 2416        remove_info: bool = False,
 2417        add_samples: bool = True,
 2418        list_samples: list = [],
 2419        where_clause: str = "",
 2420        index: bool = False,
 2421        threads: int | None = None,
 2422    ) -> bool | None:
 2423        """
 2424        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2425        remove INFO field, add samples, and control compression and indexing.
 2426
 2427        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2428        written to. It is the output file that will contain the filtered VCF data based on the specified
 2429        parameters
 2430        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2431        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2432        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2433        in, defaults to False
 2434        :type remove_info: bool (optional)
 2435        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2436        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2437        If set to False, the samples will be removed. The default value is True, defaults to True
 2438        :type add_samples: bool (optional)
 2439        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2440        in the output VCF file. By default, all samples will be included. If you provide a list of
 2441        samples, only those samples will be included in the output file
 2442        :type list_samples: list
 2443        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2444        determines whether or not to create an index for the output VCF file. If `index` is set to
 2445        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2446        :type index: bool (optional)
 2447        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2448        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2449        will be used during the export process. More threads can potentially speed up the export process
 2450        by utilizing multiple cores of the processor. If
 2451        :type threads: int | None
 2452        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2453        method with various parameters including the output file, query, threads, sort flag, and index
 2454        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2455        specified parameters and configurations provided in the `export_variant_vcf` function.
 2456        """
 2457
 2458        # Config
 2459        config = self.get_config()
 2460
 2461        # Extract VCF
 2462        log.debug("Export VCF...")
 2463
 2464        # Table variants
 2465        table_variants = self.get_table_variants()
 2466
 2467        # Threads
 2468        if not threads:
 2469            threads = self.get_threads()
 2470
 2471        # Info fields
 2472        if remove_info:
 2473            if not isinstance(remove_info, str):
 2474                remove_info = "."
 2475            info_field = f"""'{remove_info}' as INFO"""
 2476        else:
 2477            info_field = "INFO"
 2478
 2479        # Samples fields
 2480        if add_samples:
 2481            if not list_samples:
 2482                list_samples = self.get_header_sample_list()
 2483            if list_samples:
 2484                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2485            else:
 2486                samples_fields = ""
 2487            log.debug(f"samples_fields: {samples_fields}")
 2488        else:
 2489            samples_fields = ""
 2490
 2491        # Where clause
 2492        if where_clause is None:
 2493            where_clause = ""
 2494
 2495        # Variants
 2496        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2497        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2498        log.debug(f"sql_query_select={sql_query_select}")
 2499
 2500        return self.export_output(
 2501            output_file=vcf_file,
 2502            output_header=None,
 2503            export_header=True,
 2504            query=sql_query_select,
 2505            parquet_partitions=None,
 2506            chunk_size=config.get("chunk_size", None),
 2507            threads=threads,
 2508            sort=True,
 2509            index=index,
 2510            order_by=None,
 2511        )
 2512
 2513    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2514        """
 2515        It takes a list of commands and runs them in parallel using the number of threads specified
 2516
 2517        :param commands: A list of commands to run
 2518        :param threads: The number of threads to use, defaults to 1 (optional)
 2519        """
 2520
 2521        run_parallel_commands(commands, threads)
 2522
 2523    def get_threads(self, default: int = 1) -> int:
 2524        """
 2525        This function returns the number of threads to use for a job, with a default value of 1 if not
 2526        specified.
 2527
 2528        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2529        default number of threads to use if no specific value is provided. If no value is provided for
 2530        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2531        used, defaults to 1
 2532        :type default: int (optional)
 2533        :return: the number of threads to use for the current job.
 2534        """
 2535
 2536        # Config
 2537        config = self.get_config()
 2538
 2539        # Param
 2540        param = self.get_param()
 2541
 2542        # Input threads
 2543        input_thread = param.get("threads", config.get("threads", None))
 2544
 2545        # Check threads
 2546        if not input_thread:
 2547            threads = default
 2548        elif int(input_thread) <= 0:
 2549            threads = os.cpu_count()
 2550        else:
 2551            threads = int(input_thread)
 2552        return threads
 2553
 2554    def get_memory(self, default: str = None) -> str:
 2555        """
 2556        This function retrieves the memory value from parameters or configuration with a default value
 2557        if not found.
 2558
 2559        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2560        default value is used as a fallback in case the `memory` parameter is not provided in the
 2561        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2562        the function
 2563        :type default: str
 2564        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2565        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2566        return the default value provided as an argument to the function.
 2567        """
 2568
 2569        # Config
 2570        config = self.get_config()
 2571
 2572        # Param
 2573        param = self.get_param()
 2574
 2575        # Input threads
 2576        input_memory = param.get("memory", config.get("memory", None))
 2577
 2578        # Check threads
 2579        if input_memory:
 2580            memory = input_memory
 2581        else:
 2582            memory = default
 2583
 2584        return memory
 2585
 2586    def update_from_vcf(self, vcf_file: str) -> None:
 2587        """
 2588        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2589
 2590        :param vcf_file: the path to the VCF file
 2591        """
 2592
 2593        connexion_format = self.get_connexion_format()
 2594
 2595        if connexion_format in ["duckdb"]:
 2596            self.update_from_vcf_duckdb(vcf_file)
 2597        elif connexion_format in ["sqlite"]:
 2598            self.update_from_vcf_sqlite(vcf_file)
 2599
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB).

        The VCF body is loaded into a pandas DataFrame and joined against the
        variants table on #CHROM/POS/REF/ALT; non-empty INFO values from the
        VCF are appended (with a ';' separator) to the existing INFO values.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the meta-header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: the query below references the local DataFrame by its variable
        # name (vcf_df) — presumably resolved by DuckDB's replacement scan —
        # so this variable must not be renamed.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2655
 2656    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2657        """
 2658        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2659        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2660        table
 2661
 2662        :param vcf_file: The path to the VCF file you want to update the database with
 2663        """
 2664
 2665        # Create a temporary table for the VCF
 2666        table_vcf = "tmp_vcf"
 2667        sql_create = (
 2668            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2669        )
 2670        self.conn.execute(sql_create)
 2671
 2672        # Loading VCF into temporaire table
 2673        vcf_df = pd.read_csv(
 2674            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2675        )
 2676        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2677        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2678
 2679        # Update table 'variants' with VCF data
 2680        # warning: CONCAT as || operator
 2681        sql_query_update = f"""
 2682            UPDATE variants as table_variants
 2683            SET INFO = CASE
 2684                            WHEN INFO NOT IN ('', '.')
 2685                            THEN INFO
 2686                            ELSE ''
 2687                        END ||
 2688                        (
 2689                        SELECT 
 2690                            CASE 
 2691                                WHEN table_variants.INFO NOT IN ('','.') 
 2692                                    AND table_vcf.INFO NOT IN ('','.')  
 2693                                THEN ';' 
 2694                                ELSE '' 
 2695                            END || 
 2696                            CASE 
 2697                                WHEN table_vcf.INFO NOT IN ('','.') 
 2698                                THEN table_vcf.INFO 
 2699                                ELSE '' 
 2700                            END
 2701                        FROM {table_vcf} as table_vcf
 2702                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2703                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2704                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2705                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2706                        )
 2707        """
 2708        self.conn.execute(sql_query_update)
 2709
 2710        # Drop temporary table
 2711        sql_drop = f"DROP TABLE {table_vcf}"
 2712        self.conn.execute(sql_drop)
 2713
 2714    def drop_variants_table(self) -> None:
 2715        """
 2716        > This function drops the variants table
 2717        """
 2718
 2719        table_variants = self.get_table_variants()
 2720        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2721        self.conn.execute(sql_table_variants)
 2722
 2723    def set_variant_id(
 2724        self, variant_id_column: str = "variant_id", force: bool = None
 2725    ) -> str:
 2726        """
 2727        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2728        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2729
 2730        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2731        to variant_id
 2732        :type variant_id_column: str (optional)
 2733        :param force: If True, the variant_id column will be created even if it already exists
 2734        :type force: bool
 2735        :return: The name of the column that contains the variant_id
 2736        """
 2737
 2738        # Assembly
 2739        assembly = self.get_param().get(
 2740            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2741        )
 2742
 2743        # INFO/Tag prefix
 2744        prefix = self.get_explode_infos_prefix()
 2745
 2746        # Explode INFO/SVTYPE
 2747        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2748
 2749        # variants table
 2750        table_variants = self.get_table_variants()
 2751
 2752        # variant_id column
 2753        if not variant_id_column:
 2754            variant_id_column = "variant_id"
 2755
 2756        # Creta variant_id column
 2757        if "variant_id" not in self.get_extra_infos() or force:
 2758
 2759            # Create column
 2760            self.add_column(
 2761                table_name=table_variants,
 2762                column_name=variant_id_column,
 2763                column_type="UBIGINT",
 2764                default_value="0",
 2765            )
 2766
 2767            # Update column
 2768            self.conn.execute(
 2769                f"""
 2770                    UPDATE {table_variants}
 2771                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2772                """
 2773            )
 2774
 2775        # Remove added columns
 2776        for added_column in added_columns:
 2777            self.drop_column(column=added_column)
 2778
 2779        # return variant_id column name
 2780        return variant_id_column
 2781
 2782    def get_variant_id_column(
 2783        self, variant_id_column: str = "variant_id", force: bool = None
 2784    ) -> str:
 2785        """
 2786        This function returns the variant_id column name
 2787
 2788        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2789        defaults to variant_id
 2790        :type variant_id_column: str (optional)
 2791        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2792        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2793        if it is not already set, or if it is set
 2794        :type force: bool
 2795        :return: The variant_id column name.
 2796        """
 2797
 2798        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2799
 2800    ###
 2801    # Annotation
 2802    ###
 2803
 2804    def scan_databases(
 2805        self,
 2806        database_formats: list = ["parquet"],
 2807        database_releases: list = ["current"],
 2808    ) -> dict:
 2809        """
 2810        The function `scan_databases` scans for available databases based on specified formats and
 2811        releases.
 2812
 2813        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2814        of the databases to be scanned. In this case, the accepted format is "parquet"
 2815        :type database_formats: list ["parquet"]
 2816        :param database_releases: The `database_releases` parameter is a list that specifies the
 2817        releases of the databases to be scanned. In the provided function, the default value for
 2818        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2819        databases that are in the "current"
 2820        :type database_releases: list
 2821        :return: The function `scan_databases` returns a dictionary containing information about
 2822        databases that match the specified formats and releases.
 2823        """
 2824
 2825        # Config
 2826        config = self.get_config()
 2827
 2828        # Param
 2829        param = self.get_param()
 2830
 2831        # Param - Assembly
 2832        assembly = param.get("assembly", config.get("assembly", None))
 2833        if not assembly:
 2834            assembly = DEFAULT_ASSEMBLY
 2835            log.warning(f"Default assembly '{assembly}'")
 2836
 2837        # Scan for availabled databases
 2838        log.info(
 2839            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2840        )
 2841        databases_infos_dict = databases_infos(
 2842            database_folder_releases=database_releases,
 2843            database_formats=database_formats,
 2844            assembly=assembly,
 2845            config=config,
 2846        )
 2847        log.info(
 2848            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2849        )
 2850
 2851        return databases_infos_dict
 2852
 2853    def annotation(self) -> None:
 2854        """
 2855        It annotates the VCF file with the annotations specified in the config file.
 2856        """
 2857
 2858        # Config
 2859        config = self.get_config()
 2860
 2861        # Param
 2862        param = self.get_param()
 2863
 2864        # Param - Assembly
 2865        assembly = param.get("assembly", config.get("assembly", None))
 2866        if not assembly:
 2867            assembly = DEFAULT_ASSEMBLY
 2868            log.warning(f"Default assembly '{assembly}'")
 2869
 2870        # annotations databases folders
 2871        annotations_databases = set(
 2872            config.get("folders", {})
 2873            .get("databases", {})
 2874            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2875            + config.get("folders", {})
 2876            .get("databases", {})
 2877            .get("parquet", ["~/howard/databases/parquet/current"])
 2878            + config.get("folders", {})
 2879            .get("databases", {})
 2880            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2881        )
 2882
 2883        # Get param annotations
 2884        if param.get("annotations", None) and isinstance(
 2885            param.get("annotations", None), str
 2886        ):
 2887            log.debug(param.get("annotations", None))
 2888            param_annotation_list = param.get("annotations").split(",")
 2889        else:
 2890            param_annotation_list = []
 2891
 2892        # Each tools param
 2893        if param.get("annotation_parquet", None) != None:
 2894            log.debug(
 2895                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2896            )
 2897            if isinstance(param.get("annotation_parquet", None), list):
 2898                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2899            else:
 2900                param_annotation_list.append(param.get("annotation_parquet"))
 2901        if param.get("annotation_snpsift", None) != None:
 2902            if isinstance(param.get("annotation_snpsift", None), list):
 2903                param_annotation_list.append(
 2904                    "snpsift:"
 2905                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2906                )
 2907            else:
 2908                param_annotation_list.append(
 2909                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2910                )
 2911        if param.get("annotation_snpeff", None) != None:
 2912            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2913        if param.get("annotation_bcftools", None) != None:
 2914            if isinstance(param.get("annotation_bcftools", None), list):
 2915                param_annotation_list.append(
 2916                    "bcftools:"
 2917                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2918                )
 2919            else:
 2920                param_annotation_list.append(
 2921                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2922                )
 2923        if param.get("annotation_annovar", None) != None:
 2924            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2925        if param.get("annotation_exomiser", None) != None:
 2926            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2927        if param.get("annotation_splice", None) != None:
 2928            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2929
 2930        # Merge param annotations list
 2931        param["annotations"] = ",".join(param_annotation_list)
 2932
 2933        # debug
 2934        log.debug(f"param_annotations={param['annotations']}")
 2935
 2936        if param.get("annotations"):
 2937
 2938            # Log
 2939            # log.info("Annotations - Check annotation parameters")
 2940
 2941            if not "annotation" in param:
 2942                param["annotation"] = {}
 2943
 2944            # List of annotations parameters
 2945            annotations_list_input = {}
 2946            if isinstance(param.get("annotations", None), str):
 2947                annotation_file_list = [
 2948                    value for value in param.get("annotations", "").split(",")
 2949                ]
 2950                for annotation_file in annotation_file_list:
 2951                    annotations_list_input[annotation_file] = {"INFO": None}
 2952            else:
 2953                annotations_list_input = param.get("annotations", {})
 2954
 2955            log.info(f"Quick Annotations:")
 2956            for annotation_key in list(annotations_list_input.keys()):
 2957                log.info(f"   {annotation_key}")
 2958
 2959            # List of annotations and associated fields
 2960            annotations_list = {}
 2961
 2962            for annotation_file in annotations_list_input:
 2963
 2964                # Explode annotations if ALL
 2965                if (
 2966                    annotation_file.upper() == "ALL"
 2967                    or annotation_file.upper().startswith("ALL:")
 2968                ):
 2969
 2970                    # check ALL parameters (formats, releases)
 2971                    annotation_file_split = annotation_file.split(":")
 2972                    database_formats = "parquet"
 2973                    database_releases = "current"
 2974                    for annotation_file_option in annotation_file_split[1:]:
 2975                        database_all_options_split = annotation_file_option.split("=")
 2976                        if database_all_options_split[0] == "format":
 2977                            database_formats = database_all_options_split[1].split("+")
 2978                        if database_all_options_split[0] == "release":
 2979                            database_releases = database_all_options_split[1].split("+")
 2980
 2981                    # Scan for availabled databases
 2982                    databases_infos_dict = self.scan_databases(
 2983                        database_formats=database_formats,
 2984                        database_releases=database_releases,
 2985                    )
 2986
 2987                    # Add found databases in annotation parameters
 2988                    for database_infos in databases_infos_dict.keys():
 2989                        annotations_list[database_infos] = {"INFO": None}
 2990
 2991                else:
 2992                    annotations_list[annotation_file] = annotations_list_input[
 2993                        annotation_file
 2994                    ]
 2995
 2996            # Check each databases
 2997            if len(annotations_list):
 2998
 2999                log.info(
 3000                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 3001                )
 3002
 3003                for annotation_file in annotations_list:
 3004
 3005                    # Init
 3006                    annotations = annotations_list.get(annotation_file, None)
 3007
 3008                    # Annotation snpEff
 3009                    if annotation_file.startswith("snpeff"):
 3010
 3011                        log.debug(f"Quick Annotation snpEff")
 3012
 3013                        if "snpeff" not in param["annotation"]:
 3014                            param["annotation"]["snpeff"] = {}
 3015
 3016                        if "options" not in param["annotation"]["snpeff"]:
 3017                            param["annotation"]["snpeff"]["options"] = ""
 3018
 3019                        # snpEff options in annotations
 3020                        param["annotation"]["snpeff"]["options"] = "".join(
 3021                            annotation_file.split(":")[1:]
 3022                        )
 3023
 3024                    # Annotation Annovar
 3025                    elif annotation_file.startswith("annovar"):
 3026
 3027                        log.debug(f"Quick Annotation Annovar")
 3028
 3029                        if "annovar" not in param["annotation"]:
 3030                            param["annotation"]["annovar"] = {}
 3031
 3032                        if "annotations" not in param["annotation"]["annovar"]:
 3033                            param["annotation"]["annovar"]["annotations"] = {}
 3034
 3035                        # Options
 3036                        annotation_file_split = annotation_file.split(":")
 3037                        for annotation_file_annotation in annotation_file_split[1:]:
 3038                            if annotation_file_annotation:
 3039                                param["annotation"]["annovar"]["annotations"][
 3040                                    annotation_file_annotation
 3041                                ] = annotations
 3042
 3043                    # Annotation Exomiser
 3044                    elif annotation_file.startswith("exomiser"):
 3045
 3046                        log.debug(f"Quick Annotation Exomiser")
 3047
 3048                        param["annotation"]["exomiser"] = params_string_to_dict(
 3049                            annotation_file
 3050                        )
 3051
 3052                    # Annotation Splice
 3053                    elif annotation_file.startswith("splice"):
 3054
 3055                        log.debug(f"Quick Annotation Splice")
 3056
 3057                        param["annotation"]["splice"] = params_string_to_dict(
 3058                            annotation_file
 3059                        )
 3060
 3061                    # Annotation Parquet or BCFTOOLS
 3062                    else:
 3063
 3064                        # Tools detection
 3065                        if annotation_file.startswith("bcftools:"):
 3066                            annotation_tool_initial = "bcftools"
 3067                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3068                        elif annotation_file.startswith("snpsift:"):
 3069                            annotation_tool_initial = "snpsift"
 3070                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3071                        else:
 3072                            annotation_tool_initial = None
 3073
 3074                        # list of files
 3075                        annotation_file_list = annotation_file.replace("+", ":").split(
 3076                            ":"
 3077                        )
 3078
 3079                        for annotation_file in annotation_file_list:
 3080
 3081                            if annotation_file:
 3082
 3083                                # Annotation tool initial
 3084                                annotation_tool = annotation_tool_initial
 3085
 3086                                # Find file
 3087                                annotation_file_found = None
 3088
 3089                                # Expand user
 3090                                annotation_file = full_path(annotation_file)
 3091
 3092                                if os.path.exists(annotation_file):
 3093                                    annotation_file_found = annotation_file
 3094
 3095                                else:
 3096                                    # Find within assembly folders
 3097                                    for annotations_database in annotations_databases:
 3098                                        found_files = find_all(
 3099                                            annotation_file,
 3100                                            os.path.join(
 3101                                                annotations_database, assembly
 3102                                            ),
 3103                                        )
 3104                                        if len(found_files) > 0:
 3105                                            annotation_file_found = found_files[0]
 3106                                            break
 3107                                    if not annotation_file_found and not assembly:
 3108                                        # Find within folders
 3109                                        for (
 3110                                            annotations_database
 3111                                        ) in annotations_databases:
 3112                                            found_files = find_all(
 3113                                                annotation_file, annotations_database
 3114                                            )
 3115                                            if len(found_files) > 0:
 3116                                                annotation_file_found = found_files[0]
 3117                                                break
 3118                                log.debug(
 3119                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3120                                )
 3121
 3122                                # Full path
 3123                                annotation_file_found = full_path(annotation_file_found)
 3124
 3125                                if annotation_file_found:
 3126
 3127                                    database = Database(database=annotation_file_found)
 3128                                    quick_annotation_format = database.get_format()
 3129                                    quick_annotation_is_compressed = (
 3130                                        database.is_compressed()
 3131                                    )
 3132                                    quick_annotation_is_indexed = os.path.exists(
 3133                                        f"{annotation_file_found}.tbi"
 3134                                    )
 3135                                    bcftools_preference = False
 3136
 3137                                    # Check Annotation Tool
 3138                                    if not annotation_tool:
 3139                                        if (
 3140                                            bcftools_preference
 3141                                            and quick_annotation_format
 3142                                            in ["vcf", "bed"]
 3143                                            and quick_annotation_is_compressed
 3144                                            and quick_annotation_is_indexed
 3145                                        ):
 3146                                            annotation_tool = "bcftools"
 3147                                        elif quick_annotation_format in [
 3148                                            "vcf",
 3149                                            "bed",
 3150                                            "tsv",
 3151                                            "tsv",
 3152                                            "csv",
 3153                                            "json",
 3154                                            "tbl",
 3155                                            "parquet",
 3156                                            "duckdb",
 3157                                        ]:
 3158                                            annotation_tool = "parquet"
 3159                                        else:
 3160                                            log.error(
 3161                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3162                                            )
 3163                                            raise ValueError(
 3164                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3165                                            )
 3166
 3167                                    log.debug(
 3168                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3169                                    )
 3170
 3171                                    # Annotation Tool dispatch
 3172                                    if annotation_tool:
 3173                                        if annotation_tool not in param["annotation"]:
 3174                                            param["annotation"][annotation_tool] = {}
 3175                                        if (
 3176                                            "annotations"
 3177                                            not in param["annotation"][annotation_tool]
 3178                                        ):
 3179                                            param["annotation"][annotation_tool][
 3180                                                "annotations"
 3181                                            ] = {}
 3182                                        param["annotation"][annotation_tool][
 3183                                            "annotations"
 3184                                        ][annotation_file_found] = annotations
 3185
 3186                                else:
 3187                                    log.error(
 3188                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3189                                    )
 3190
 3191                self.set_param(param)
 3192
 3193        if param.get("annotation", None):
 3194            log.info("Annotations")
 3195            if param.get("annotation", {}).get("parquet", None):
 3196                log.info("Annotations 'parquet'...")
 3197                self.annotation_parquet()
 3198            if param.get("annotation", {}).get("bcftools", None):
 3199                log.info("Annotations 'bcftools'...")
 3200                self.annotation_bcftools()
 3201            if param.get("annotation", {}).get("snpsift", None):
 3202                log.info("Annotations 'snpsift'...")
 3203                self.annotation_snpsift()
 3204            if param.get("annotation", {}).get("annovar", None):
 3205                log.info("Annotations 'annovar'...")
 3206                self.annotation_annovar()
 3207            if param.get("annotation", {}).get("snpeff", None):
 3208                log.info("Annotations 'snpeff'...")
 3209                self.annotation_snpeff()
 3210            if param.get("annotation", {}).get("exomiser", None) is not None:
 3211                log.info("Annotations 'exomiser'...")
 3212                self.annotation_exomiser()
 3213            if param.get("annotation", {}).get("splice", None) is not None:
 3214                log.info("Annotations 'splice' ...")
 3215                self.annotation_splice()
 3216
 3217        # Explode INFOS fields into table fields
 3218        if self.get_explode_infos():
 3219            self.explode_infos(
 3220                prefix=self.get_explode_infos_prefix(),
 3221                fields=self.get_explode_infos_fields(),
 3222                force=True,
 3223            )
 3224
 3225    def annotation_snpsift(self, threads: int = None) -> None:
 3226        """
 3227        This function annotate with bcftools
 3228
 3229        :param threads: Number of threads to use
 3230        :return: the value of the variable "return_value".
 3231        """
 3232
 3233        # DEBUG
 3234        log.debug("Start annotation with bcftools databases")
 3235
 3236        # Threads
 3237        if not threads:
 3238            threads = self.get_threads()
 3239        log.debug("Threads: " + str(threads))
 3240
 3241        # Config
 3242        config = self.get_config()
 3243        log.debug("Config: " + str(config))
 3244
 3245        # Config - snpSift
 3246        snpsift_bin_command = get_bin_command(
 3247            bin="SnpSift.jar",
 3248            tool="snpsift",
 3249            bin_type="jar",
 3250            config=config,
 3251            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3252        )
 3253        if not snpsift_bin_command:
 3254            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3255            log.error(msg_err)
 3256            raise ValueError(msg_err)
 3257
 3258        # Config - bcftools
 3259        bcftools_bin_command = get_bin_command(
 3260            bin="bcftools",
 3261            tool="bcftools",
 3262            bin_type="bin",
 3263            config=config,
 3264            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3265        )
 3266        if not bcftools_bin_command:
 3267            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3268            log.error(msg_err)
 3269            raise ValueError(msg_err)
 3270
 3271        # Config - BCFTools databases folders
 3272        databases_folders = set(
 3273            self.get_config()
 3274            .get("folders", {})
 3275            .get("databases", {})
 3276            .get("annotations", ["."])
 3277            + self.get_config()
 3278            .get("folders", {})
 3279            .get("databases", {})
 3280            .get("bcftools", ["."])
 3281        )
 3282        log.debug("Databases annotations: " + str(databases_folders))
 3283
 3284        # Param
 3285        annotations = (
 3286            self.get_param()
 3287            .get("annotation", {})
 3288            .get("snpsift", {})
 3289            .get("annotations", None)
 3290        )
 3291        log.debug("Annotations: " + str(annotations))
 3292
 3293        # Assembly
 3294        assembly = self.get_param().get(
 3295            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3296        )
 3297
 3298        # Data
 3299        table_variants = self.get_table_variants()
 3300
 3301        # Check if not empty
 3302        log.debug("Check if not empty")
 3303        sql_query_chromosomes = (
 3304            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3305        )
 3306        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3307        if not sql_query_chromosomes_df["count"][0]:
 3308            log.info(f"VCF empty")
 3309            return
 3310
 3311        # VCF header
 3312        vcf_reader = self.get_header()
 3313        log.debug("Initial header: " + str(vcf_reader.infos))
 3314
 3315        # Existing annotations
 3316        for vcf_annotation in self.get_header().infos:
 3317
 3318            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3319            log.debug(
 3320                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3321            )
 3322
 3323        if annotations:
 3324
 3325            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3326
 3327                # Export VCF file
 3328                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3329
 3330                # Init
 3331                commands = {}
 3332
 3333                for annotation in annotations:
 3334                    annotation_fields = annotations[annotation]
 3335
 3336                    # Annotation Name
 3337                    annotation_name = os.path.basename(annotation)
 3338
 3339                    if not annotation_fields:
 3340                        annotation_fields = {"INFO": None}
 3341
 3342                    log.debug(f"Annotation '{annotation_name}'")
 3343                    log.debug(
 3344                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3345                    )
 3346
 3347                    # Create Database
 3348                    database = Database(
 3349                        database=annotation,
 3350                        databases_folders=databases_folders,
 3351                        assembly=assembly,
 3352                    )
 3353
 3354                    # Find files
 3355                    db_file = database.get_database()
 3356                    db_file = full_path(db_file)
 3357                    db_hdr_file = database.get_header_file()
 3358                    db_hdr_file = full_path(db_hdr_file)
 3359                    db_file_type = database.get_format()
 3360                    db_tbi_file = f"{db_file}.tbi"
 3361                    db_file_compressed = database.is_compressed()
 3362
 3363                    # Check if compressed
 3364                    if not db_file_compressed:
 3365                        log.error(
 3366                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3367                        )
 3368                        raise ValueError(
 3369                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3370                        )
 3371
 3372                    # Check if indexed
 3373                    if not os.path.exists(db_tbi_file):
 3374                        log.error(
 3375                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3376                        )
 3377                        raise ValueError(
 3378                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3379                        )
 3380
 3381                    # Check index - try to create if not exists
 3382                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3383                        log.error("Annotation failed: database not valid")
 3384                        log.error(f"Annotation annotation file: {db_file}")
 3385                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3386                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3387                        raise ValueError(
 3388                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3389                        )
 3390                    else:
 3391
 3392                        log.debug(
 3393                            f"Annotation '{annotation}' - file: "
 3394                            + str(db_file)
 3395                            + " and "
 3396                            + str(db_hdr_file)
 3397                        )
 3398
 3399                        # Load header as VCF object
 3400                        db_hdr_vcf = Variants(input=db_hdr_file)
 3401                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3402                        log.debug(
 3403                            "Annotation database header: "
 3404                            + str(db_hdr_vcf_header_infos)
 3405                        )
 3406
 3407                        # For all fields in database
 3408                        annotation_fields_full = False
 3409                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3410                            annotation_fields = {
 3411                                key: key for key in db_hdr_vcf_header_infos
 3412                            }
 3413                            log.debug(
 3414                                "Annotation database header - All annotations added: "
 3415                                + str(annotation_fields)
 3416                            )
 3417                            annotation_fields_full = True
 3418
 3419                        # # Create file for field rename
 3420                        # log.debug("Create file for field rename")
 3421                        # tmp_rename = NamedTemporaryFile(
 3422                        #     prefix=self.get_prefix(),
 3423                        #     dir=self.get_tmp_dir(),
 3424                        #     suffix=".rename",
 3425                        #     delete=False,
 3426                        # )
 3427                        # tmp_rename_name = tmp_rename.name
 3428                        # tmp_files.append(tmp_rename_name)
 3429
 3430                        # Number of fields
 3431                        nb_annotation_field = 0
 3432                        annotation_list = []
 3433                        annotation_infos_rename_list = []
 3434
 3435                        for annotation_field in annotation_fields:
 3436
 3437                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3438                            annotation_fields_new_name = annotation_fields.get(
 3439                                annotation_field, annotation_field
 3440                            )
 3441                            if not annotation_fields_new_name:
 3442                                annotation_fields_new_name = annotation_field
 3443
 3444                            # Check if field is in DB and if field is not elready in input data
 3445                            if (
 3446                                annotation_field in db_hdr_vcf.get_header().infos
 3447                                and annotation_fields_new_name
 3448                                not in self.get_header().infos
 3449                            ):
 3450
 3451                                log.info(
 3452                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3453                                )
 3454
 3455                                # BCFTools annotate param to rename fields
 3456                                if annotation_field != annotation_fields_new_name:
 3457                                    annotation_infos_rename_list.append(
 3458                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3459                                    )
 3460
 3461                                # Add INFO field to header
 3462                                db_hdr_vcf_header_infos_number = (
 3463                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3464                                )
 3465                                db_hdr_vcf_header_infos_type = (
 3466                                    db_hdr_vcf_header_infos[annotation_field].type
 3467                                    or "String"
 3468                                )
 3469                                db_hdr_vcf_header_infos_description = (
 3470                                    db_hdr_vcf_header_infos[annotation_field].desc
 3471                                    or f"{annotation_field} description"
 3472                                )
 3473                                db_hdr_vcf_header_infos_source = (
 3474                                    db_hdr_vcf_header_infos[annotation_field].source
 3475                                    or "unknown"
 3476                                )
 3477                                db_hdr_vcf_header_infos_version = (
 3478                                    db_hdr_vcf_header_infos[annotation_field].version
 3479                                    or "unknown"
 3480                                )
 3481
 3482                                vcf_reader.infos[annotation_fields_new_name] = (
 3483                                    vcf.parser._Info(
 3484                                        annotation_fields_new_name,
 3485                                        db_hdr_vcf_header_infos_number,
 3486                                        db_hdr_vcf_header_infos_type,
 3487                                        db_hdr_vcf_header_infos_description,
 3488                                        db_hdr_vcf_header_infos_source,
 3489                                        db_hdr_vcf_header_infos_version,
 3490                                        self.code_type_map[
 3491                                            db_hdr_vcf_header_infos_type
 3492                                        ],
 3493                                    )
 3494                                )
 3495
 3496                                annotation_list.append(annotation_field)
 3497
 3498                                nb_annotation_field += 1
 3499
 3500                            else:
 3501
 3502                                if (
 3503                                    annotation_field
 3504                                    not in db_hdr_vcf.get_header().infos
 3505                                ):
 3506                                    log.warning(
 3507                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3508                                    )
 3509                                if (
 3510                                    annotation_fields_new_name
 3511                                    in self.get_header().infos
 3512                                ):
 3513                                    log.warning(
 3514                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3515                                    )
 3516
 3517                        log.info(
 3518                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3519                        )
 3520
 3521                        annotation_infos = ",".join(annotation_list)
 3522
 3523                        if annotation_infos != "":
 3524
 3525                            # Annotated VCF (and error file)
 3526                            tmp_annotation_vcf_name = os.path.join(
 3527                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3528                            )
 3529                            tmp_annotation_vcf_name_err = (
 3530                                tmp_annotation_vcf_name + ".err"
 3531                            )
 3532
 3533                            # Add fields to annotate
 3534                            if not annotation_fields_full:
 3535                                annotation_infos_option = f"-info {annotation_infos}"
 3536                            else:
 3537                                annotation_infos_option = ""
 3538
 3539                            # Info fields rename
 3540                            if annotation_infos_rename_list:
 3541                                annotation_infos_rename = " -c " + ",".join(
 3542                                    annotation_infos_rename_list
 3543                                )
 3544                            else:
 3545                                annotation_infos_rename = ""
 3546
 3547                            # Annotate command
 3548                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3549
 3550                            # Add command
 3551                            commands[command_annotate] = tmp_annotation_vcf_name
 3552
 3553                if commands:
 3554
 3555                    # Export VCF file
 3556                    self.export_variant_vcf(
 3557                        vcf_file=tmp_vcf_name,
 3558                        remove_info=True,
 3559                        add_samples=False,
 3560                        index=True,
 3561                    )
 3562                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3563
 3564                    # Num command
 3565                    nb_command = 0
 3566
 3567                    # Annotate
 3568                    for command_annotate in commands:
 3569                        nb_command += 1
 3570                        log.info(
 3571                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3572                        )
 3573                        log.debug(f"command_annotate={command_annotate}")
 3574                        run_parallel_commands([command_annotate], threads)
 3575
 3576                        # Debug
 3577                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3578
 3579                        # Update variants
 3580                        log.info(
 3581                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3582                        )
 3583                        self.update_from_vcf(commands[command_annotate])
 3584
 3585    def annotation_bcftools(self, threads: int = None) -> None:
 3586        """
 3587        This function annotate with bcftools
 3588
 3589        :param threads: Number of threads to use
 3590        :return: the value of the variable "return_value".
 3591        """
 3592
 3593        # DEBUG
 3594        log.debug("Start annotation with bcftools databases")
 3595
 3596        # Threads
 3597        if not threads:
 3598            threads = self.get_threads()
 3599        log.debug("Threads: " + str(threads))
 3600
 3601        # Config
 3602        config = self.get_config()
 3603        log.debug("Config: " + str(config))
 3604
 3605        # DEBUG
 3606        delete_tmp = True
 3607        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3608            delete_tmp = False
 3609            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3610
 3611        # Config - BCFTools bin command
 3612        bcftools_bin_command = get_bin_command(
 3613            bin="bcftools",
 3614            tool="bcftools",
 3615            bin_type="bin",
 3616            config=config,
 3617            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3618        )
 3619        if not bcftools_bin_command:
 3620            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3621            log.error(msg_err)
 3622            raise ValueError(msg_err)
 3623
 3624        # Config - BCFTools databases folders
 3625        databases_folders = set(
 3626            self.get_config()
 3627            .get("folders", {})
 3628            .get("databases", {})
 3629            .get("annotations", ["."])
 3630            + self.get_config()
 3631            .get("folders", {})
 3632            .get("databases", {})
 3633            .get("bcftools", ["."])
 3634        )
 3635        log.debug("Databases annotations: " + str(databases_folders))
 3636
 3637        # Param
 3638        annotations = (
 3639            self.get_param()
 3640            .get("annotation", {})
 3641            .get("bcftools", {})
 3642            .get("annotations", None)
 3643        )
 3644        log.debug("Annotations: " + str(annotations))
 3645
 3646        # Assembly
 3647        assembly = self.get_param().get(
 3648            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3649        )
 3650
 3651        # Data
 3652        table_variants = self.get_table_variants()
 3653
 3654        # Check if not empty
 3655        log.debug("Check if not empty")
 3656        sql_query_chromosomes = (
 3657            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3658        )
 3659        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3660        if not sql_query_chromosomes_df["count"][0]:
 3661            log.info(f"VCF empty")
 3662            return
 3663
 3664        # Export in VCF
 3665        log.debug("Create initial file to annotate")
 3666        tmp_vcf = NamedTemporaryFile(
 3667            prefix=self.get_prefix(),
 3668            dir=self.get_tmp_dir(),
 3669            suffix=".vcf.gz",
 3670            delete=False,
 3671        )
 3672        tmp_vcf_name = tmp_vcf.name
 3673
 3674        # VCF header
 3675        vcf_reader = self.get_header()
 3676        log.debug("Initial header: " + str(vcf_reader.infos))
 3677
 3678        # Existing annotations
 3679        for vcf_annotation in self.get_header().infos:
 3680
 3681            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3682            log.debug(
 3683                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3684            )
 3685
 3686        if annotations:
 3687
 3688            tmp_ann_vcf_list = []
 3689            commands = []
 3690            tmp_files = []
 3691            err_files = []
 3692
 3693            for annotation in annotations:
 3694                annotation_fields = annotations[annotation]
 3695
 3696                # Annotation Name
 3697                annotation_name = os.path.basename(annotation)
 3698
 3699                if not annotation_fields:
 3700                    annotation_fields = {"INFO": None}
 3701
 3702                log.debug(f"Annotation '{annotation_name}'")
 3703                log.debug(
 3704                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3705                )
 3706
 3707                # Create Database
 3708                database = Database(
 3709                    database=annotation,
 3710                    databases_folders=databases_folders,
 3711                    assembly=assembly,
 3712                )
 3713
 3714                # Find files
 3715                db_file = database.get_database()
 3716                db_file = full_path(db_file)
 3717                db_hdr_file = database.get_header_file()
 3718                db_hdr_file = full_path(db_hdr_file)
 3719                db_file_type = database.get_format()
 3720                db_tbi_file = f"{db_file}.tbi"
 3721                db_file_compressed = database.is_compressed()
 3722
 3723                # Check if compressed
 3724                if not db_file_compressed:
 3725                    log.error(
 3726                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3727                    )
 3728                    raise ValueError(
 3729                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3730                    )
 3731
 3732                # Check if indexed
 3733                if not os.path.exists(db_tbi_file):
 3734                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3735                    raise ValueError(
 3736                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3737                    )
 3738
 3739                # Check index - try to create if not exists
 3740                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3741                    log.error("Annotation failed: database not valid")
 3742                    log.error(f"Annotation annotation file: {db_file}")
 3743                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3744                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3745                    raise ValueError(
 3746                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3747                    )
 3748                else:
 3749
 3750                    log.debug(
 3751                        f"Annotation '{annotation}' - file: "
 3752                        + str(db_file)
 3753                        + " and "
 3754                        + str(db_hdr_file)
 3755                    )
 3756
 3757                    # Load header as VCF object
 3758                    db_hdr_vcf = Variants(input=db_hdr_file)
 3759                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3760                    log.debug(
 3761                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3762                    )
 3763
 3764                    # For all fields in database
 3765                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3766                        annotation_fields = {
 3767                            key: key for key in db_hdr_vcf_header_infos
 3768                        }
 3769                        log.debug(
 3770                            "Annotation database header - All annotations added: "
 3771                            + str(annotation_fields)
 3772                        )
 3773
 3774                    # Number of fields
 3775                    nb_annotation_field = 0
 3776                    annotation_list = []
 3777
 3778                    for annotation_field in annotation_fields:
 3779
 3780                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3781                        annotation_fields_new_name = annotation_fields.get(
 3782                            annotation_field, annotation_field
 3783                        )
 3784                        if not annotation_fields_new_name:
 3785                            annotation_fields_new_name = annotation_field
 3786
 3787                        # Check if field is in DB and if field is not elready in input data
 3788                        if (
 3789                            annotation_field in db_hdr_vcf.get_header().infos
 3790                            and annotation_fields_new_name
 3791                            not in self.get_header().infos
 3792                        ):
 3793
 3794                            log.info(
 3795                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3796                            )
 3797
 3798                            # Add INFO field to header
 3799                            db_hdr_vcf_header_infos_number = (
 3800                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3801                            )
 3802                            db_hdr_vcf_header_infos_type = (
 3803                                db_hdr_vcf_header_infos[annotation_field].type
 3804                                or "String"
 3805                            )
 3806                            db_hdr_vcf_header_infos_description = (
 3807                                db_hdr_vcf_header_infos[annotation_field].desc
 3808                                or f"{annotation_field} description"
 3809                            )
 3810                            db_hdr_vcf_header_infos_source = (
 3811                                db_hdr_vcf_header_infos[annotation_field].source
 3812                                or "unknown"
 3813                            )
 3814                            db_hdr_vcf_header_infos_version = (
 3815                                db_hdr_vcf_header_infos[annotation_field].version
 3816                                or "unknown"
 3817                            )
 3818
 3819                            vcf_reader.infos[annotation_fields_new_name] = (
 3820                                vcf.parser._Info(
 3821                                    annotation_fields_new_name,
 3822                                    db_hdr_vcf_header_infos_number,
 3823                                    db_hdr_vcf_header_infos_type,
 3824                                    db_hdr_vcf_header_infos_description,
 3825                                    db_hdr_vcf_header_infos_source,
 3826                                    db_hdr_vcf_header_infos_version,
 3827                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3828                                )
 3829                            )
 3830
 3831                            # annotation_list.append(annotation_field)
 3832                            if annotation_field != annotation_fields_new_name:
 3833                                annotation_list.append(
 3834                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3835                                )
 3836                            else:
 3837                                annotation_list.append(annotation_field)
 3838
 3839                            nb_annotation_field += 1
 3840
 3841                        else:
 3842
 3843                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3844                                log.warning(
 3845                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3846                                )
 3847                            if annotation_fields_new_name in self.get_header().infos:
 3848                                log.warning(
 3849                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3850                                )
 3851
 3852                    log.info(
 3853                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3854                    )
 3855
 3856                    annotation_infos = ",".join(annotation_list)
 3857
 3858                    if annotation_infos != "":
 3859
 3860                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3861                        log.debug("Protect Header file - remove #CHROM line if exists")
 3862                        tmp_header_vcf = NamedTemporaryFile(
 3863                            prefix=self.get_prefix(),
 3864                            dir=self.get_tmp_dir(),
 3865                            suffix=".hdr",
 3866                            delete=False,
 3867                        )
 3868                        tmp_header_vcf_name = tmp_header_vcf.name
 3869                        tmp_files.append(tmp_header_vcf_name)
 3870                        # Command
 3871                        if db_hdr_file.endswith(".gz"):
 3872                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3873                        else:
 3874                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3875                        # Run
 3876                        run_parallel_commands([command_extract_header], 1)
 3877
 3878                        # Find chomosomes
 3879                        log.debug("Find chromosomes ")
 3880                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3881                        sql_query_chromosomes_df = self.get_query_to_df(
 3882                            sql_query_chromosomes
 3883                        )
 3884                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3885
 3886                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3887
 3888                        # BED columns in the annotation file
 3889                        if db_file_type in ["bed"]:
 3890                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3891
 3892                        for chrom in chomosomes_list:
 3893
 3894                            # Create BED on initial VCF
 3895                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3896                            tmp_bed = NamedTemporaryFile(
 3897                                prefix=self.get_prefix(),
 3898                                dir=self.get_tmp_dir(),
 3899                                suffix=".bed",
 3900                                delete=False,
 3901                            )
 3902                            tmp_bed_name = tmp_bed.name
 3903                            tmp_files.append(tmp_bed_name)
 3904
 3905                            # Detecte regions
 3906                            log.debug(
 3907                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3908                            )
 3909                            window = 1000000
 3910                            sql_query_intervals_for_bed = f"""
 3911                                SELECT  \"#CHROM\",
 3912                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3913                                        \"POS\"+{window}
 3914                                FROM {table_variants} as table_variants
 3915                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3916                            """
 3917                            regions = self.conn.execute(
 3918                                sql_query_intervals_for_bed
 3919                            ).fetchall()
 3920                            merged_regions = merge_regions(regions)
 3921                            log.debug(
 3922                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3923                            )
 3924
 3925                            header = ["#CHROM", "START", "END"]
 3926                            with open(tmp_bed_name, "w") as f:
 3927                                # Write the header with tab delimiter
 3928                                f.write("\t".join(header) + "\n")
 3929                                for d in merged_regions:
 3930                                    # Write each data row with tab delimiter
 3931                                    f.write("\t".join(map(str, d)) + "\n")
 3932
 3933                            # Tmp files
 3934                            tmp_annotation_vcf = NamedTemporaryFile(
 3935                                prefix=self.get_prefix(),
 3936                                dir=self.get_tmp_dir(),
 3937                                suffix=".vcf.gz",
 3938                                delete=False,
 3939                            )
 3940                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3941                            tmp_files.append(tmp_annotation_vcf_name)
 3942                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3943                            tmp_annotation_vcf_name_err = (
 3944                                tmp_annotation_vcf_name + ".err"
 3945                            )
 3946                            err_files.append(tmp_annotation_vcf_name_err)
 3947
 3948                            # Annotate Command
 3949                            log.debug(
 3950                                f"Annotation '{annotation}' - add bcftools command"
 3951                            )
 3952
 3953                            # Command
 3954                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3955
 3956                            # Add command
 3957                            commands.append(command_annotate)
 3958
 3959            # if some commands
 3960            if commands:
 3961
 3962                # Export VCF file
 3963                self.export_variant_vcf(
 3964                    vcf_file=tmp_vcf_name,
 3965                    remove_info=True,
 3966                    add_samples=False,
 3967                    index=True,
 3968                )
 3969
 3970                # Threads
 3971                # calculate threads for annotated commands
 3972                if commands:
 3973                    threads_bcftools_annotate = round(threads / len(commands))
 3974                else:
 3975                    threads_bcftools_annotate = 1
 3976
 3977                if not threads_bcftools_annotate:
 3978                    threads_bcftools_annotate = 1
 3979
 3980                # Add threads option to bcftools commands
 3981                if threads_bcftools_annotate > 1:
 3982                    commands_threaded = []
 3983                    for command in commands:
 3984                        commands_threaded.append(
 3985                            command.replace(
 3986                                f"{bcftools_bin_command} annotate ",
 3987                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3988                            )
 3989                        )
 3990                    commands = commands_threaded
 3991
 3992                # Command annotation multithreading
 3993                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3994                log.info(
 3995                    f"Annotation - Annotation multithreaded in "
 3996                    + str(len(commands))
 3997                    + " commands"
 3998                )
 3999
 4000                run_parallel_commands(commands, threads)
 4001
 4002                # Merge
 4003                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4004
 4005                if tmp_ann_vcf_list_cmd:
 4006
 4007                    # Tmp file
 4008                    tmp_annotate_vcf = NamedTemporaryFile(
 4009                        prefix=self.get_prefix(),
 4010                        dir=self.get_tmp_dir(),
 4011                        suffix=".vcf.gz",
 4012                        delete=True,
 4013                    )
 4014                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4015                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4016                    err_files.append(tmp_annotate_vcf_name_err)
 4017
 4018                    # Tmp file remove command
 4019                    tmp_files_remove_command = ""
 4020                    if tmp_files:
 4021                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4022
 4023                    # Command merge
 4024                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4025                    log.info(
 4026                        f"Annotation - Annotation merging "
 4027                        + str(len(commands))
 4028                        + " annotated files"
 4029                    )
 4030                    log.debug(f"Annotation - merge command: {merge_command}")
 4031                    run_parallel_commands([merge_command], 1)
 4032
 4033                    # Error messages
 4034                    log.info(f"Error/Warning messages:")
 4035                    error_message_command_all = []
 4036                    error_message_command_warning = []
 4037                    error_message_command_err = []
 4038                    for err_file in err_files:
 4039                        with open(err_file, "r") as f:
 4040                            for line in f:
 4041                                message = line.strip()
 4042                                error_message_command_all.append(message)
 4043                                if line.startswith("[W::"):
 4044                                    error_message_command_warning.append(message)
 4045                                if line.startswith("[E::"):
 4046                                    error_message_command_err.append(
 4047                                        f"{err_file}: " + message
 4048                                    )
 4049                    # log info
 4050                    for message in list(
 4051                        set(error_message_command_err + error_message_command_warning)
 4052                    ):
 4053                        log.info(f"   {message}")
 4054                    # debug info
 4055                    for message in list(set(error_message_command_all)):
 4056                        log.debug(f"   {message}")
 4057                    # failed
 4058                    if len(error_message_command_err):
 4059                        log.error("Annotation failed: Error in commands")
 4060                        raise ValueError("Annotation failed: Error in commands")
 4061
 4062                    # Update variants
 4063                    log.info(f"Annotation - Updating...")
 4064                    self.update_from_vcf(tmp_annotate_vcf_name)
 4065
 4066    def annotation_exomiser(self, threads: int = None) -> None:
 4067        """
 4068        This function annotate with Exomiser
 4069
 4070        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4071        - "analysis" (dict/file):
 4072            Full analysis dictionnary parameters (see Exomiser docs).
 4073            Either a dict, or a file in JSON or YAML format.
 4074            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4075            Default : None
 4076        - "preset" (string):
 4077            Analysis preset (available in config folder).
 4078            Used if no full "analysis" is provided.
 4079            Default: "exome"
 4080        - "phenopacket" (dict/file):
 4081            Samples and phenotipic features parameters (see Exomiser docs).
 4082            Either a dict, or a file in JSON or YAML format.
 4083            Default: None
 4084        - "subject" (dict):
 4085            Sample parameters (see Exomiser docs).
 4086            Example:
 4087                "subject":
 4088                    {
 4089                        "id": "ISDBM322017",
 4090                        "sex": "FEMALE"
 4091                    }
 4092            Default: None
 4093        - "sample" (string):
 4094            Sample name to construct "subject" section:
 4095                "subject":
 4096                    {
 4097                        "id": "<sample>",
 4098                        "sex": "UNKNOWN_SEX"
 4099                    }
 4100            Default: None
 4101        - "phenotypicFeatures" (dict)
 4102            Phenotypic features to construct "subject" section.
 4103            Example:
 4104                "phenotypicFeatures":
 4105                    [
 4106                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4107                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4108                    ]
 4109        - "hpo" (list)
 4110            List of HPO ids as phenotypic features.
 4111            Example:
 4112                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4113            Default: []
 4114        - "outputOptions" (dict):
 4115            Output options (see Exomiser docs).
 4116            Default:
 4117                "output_options" =
 4118                    {
 4119                        "outputContributingVariantsOnly": False,
 4120                        "numGenes": 0,
 4121                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4122                    }
 4123        - "transcript_source" (string):
 4124            Transcript source (either "refseq", "ucsc", "ensembl")
 4125            Default: "refseq"
 4126        - "exomiser_to_info" (boolean):
 4127            Add exomiser TSV file columns as INFO fields in VCF.
 4128            Default: False
 4129        - "release" (string):
 4130            Exomise database release.
 4131            If not exists, database release will be downloaded (take a while).
 4132            Default: None (provided by application.properties configuration file)
 4133        - "exomiser_application_properties" (file):
 4134            Exomiser configuration file (see Exomiser docs).
 4135            Useful to automatically download databases (especially for specific genome databases).
 4136
 4137        Notes:
 4138        - If no sample in parameters, first sample in VCF will be chosen
 4139        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4140
 4141        :param threads: The number of threads to use
 4142        :return: None.
 4143        """
 4144
 4145        # DEBUG
 4146        log.debug("Start annotation with Exomiser databases")
 4147
 4148        # Threads
 4149        if not threads:
 4150            threads = self.get_threads()
 4151        log.debug("Threads: " + str(threads))
 4152
 4153        # Config
 4154        config = self.get_config()
 4155        log.debug("Config: " + str(config))
 4156
 4157        # Config - Folders - Databases
 4158        databases_folders = (
 4159            config.get("folders", {})
 4160            .get("databases", {})
 4161            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4162        )
 4163        databases_folders = full_path(databases_folders)
 4164        if not os.path.exists(databases_folders):
 4165            log.error(f"Databases annotations: {databases_folders} NOT found")
 4166        log.debug("Databases annotations: " + str(databases_folders))
 4167
 4168        # Config - Exomiser
 4169        exomiser_bin_command = get_bin_command(
 4170            bin="exomiser-cli*.jar",
 4171            tool="exomiser",
 4172            bin_type="jar",
 4173            config=config,
 4174            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4175        )
 4176        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4177        if not exomiser_bin_command:
 4178            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4179            log.error(msg_err)
 4180            raise ValueError(msg_err)
 4181
 4182        # Param
 4183        param = self.get_param()
 4184        log.debug("Param: " + str(param))
 4185
 4186        # Param - Exomiser
 4187        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4188        log.debug(f"Param Exomiser: {param_exomiser}")
 4189
 4190        # Param - Assembly
 4191        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4192        log.debug("Assembly: " + str(assembly))
 4193
 4194        # Data
 4195        table_variants = self.get_table_variants()
 4196
 4197        # Check if not empty
 4198        log.debug("Check if not empty")
 4199        sql_query_chromosomes = (
 4200            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4201        )
 4202        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4203            log.info(f"VCF empty")
 4204            return False
 4205
 4206        # VCF header
 4207        vcf_reader = self.get_header()
 4208        log.debug("Initial header: " + str(vcf_reader.infos))
 4209
 4210        # Samples
 4211        samples = self.get_header_sample_list()
 4212        if not samples:
 4213            log.error("No Samples in VCF")
 4214            return False
 4215        log.debug(f"Samples: {samples}")
 4216
 4217        # Memory limit
 4218        memory_limit = self.get_memory("8G")
 4219        log.debug(f"memory_limit: {memory_limit}")
 4220
 4221        # Exomiser java options
 4222        exomiser_java_options = (
 4223            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4224        )
 4225        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4226
 4227        # Download Exomiser (if not exists)
 4228        exomiser_release = param_exomiser.get("release", None)
 4229        exomiser_application_properties = param_exomiser.get(
 4230            "exomiser_application_properties", None
 4231        )
 4232        databases_download_exomiser(
 4233            assemblies=[assembly],
 4234            exomiser_folder=databases_folders,
 4235            exomiser_release=exomiser_release,
 4236            exomiser_phenotype_release=exomiser_release,
 4237            exomiser_application_properties=exomiser_application_properties,
 4238        )
 4239
 4240        # Force annotation
 4241        force_update_annotation = True
 4242
 4243        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4244            log.debug("Start annotation Exomiser")
 4245
 4246            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4247
 4248                # tmp_dir = "/tmp/exomiser"
 4249
 4250                ### ANALYSIS ###
 4251                ################
 4252
 4253                # Create analysis.json through analysis dict
 4254                # either analysis in param or by default
 4255                # depending on preset exome/genome)
 4256
 4257                # Init analysis dict
 4258                param_exomiser_analysis_dict = {}
 4259
 4260                # analysis from param
 4261                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4262                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4263
 4264                # If analysis in param -> load anlaysis json
 4265                if param_exomiser_analysis:
 4266
 4267                    # If param analysis is a file and exists
 4268                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4269                        param_exomiser_analysis
 4270                    ):
 4271                        # Load analysis file into analysis dict (either yaml or json)
 4272                        with open(param_exomiser_analysis) as json_file:
 4273                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4274
 4275                    # If param analysis is a dict
 4276                    elif isinstance(param_exomiser_analysis, dict):
 4277                        # Load analysis dict into analysis dict (either yaml or json)
 4278                        param_exomiser_analysis_dict = param_exomiser_analysis
 4279
 4280                    # Error analysis type
 4281                    else:
 4282                        log.error(f"Analysis type unknown. Check param file.")
 4283                        raise ValueError(f"Analysis type unknown. Check param file.")
 4284
 4285                # Case no input analysis config file/dict
 4286                # Use preset (exome/genome) to open default config file
 4287                if not param_exomiser_analysis_dict:
 4288
 4289                    # default preset
 4290                    default_preset = "exome"
 4291
 4292                    # Get param preset or default preset
 4293                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4294
 4295                    # Try to find if preset is a file
 4296                    if os.path.exists(param_exomiser_preset):
 4297                        # Preset file is provided in full path
 4298                        param_exomiser_analysis_default_config_file = (
 4299                            param_exomiser_preset
 4300                        )
 4301                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4302                    #     # Preset file is provided in full path
 4303                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4304                    elif os.path.exists(
 4305                        os.path.join(folder_config, param_exomiser_preset)
 4306                    ):
 4307                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4308                        param_exomiser_analysis_default_config_file = os.path.join(
 4309                            folder_config, param_exomiser_preset
 4310                        )
 4311                    else:
 4312                        # Construct preset file
 4313                        param_exomiser_analysis_default_config_file = os.path.join(
 4314                            folder_config,
 4315                            f"preset-{param_exomiser_preset}-analysis.json",
 4316                        )
 4317
 4318                    # If preset file exists
 4319                    param_exomiser_analysis_default_config_file = full_path(
 4320                        param_exomiser_analysis_default_config_file
 4321                    )
 4322                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4323                        # Load prest file into analysis dict (either yaml or json)
 4324                        with open(
 4325                            param_exomiser_analysis_default_config_file
 4326                        ) as json_file:
 4327                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4328                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4329                                json_file
 4330                            )
 4331
 4332                    # Error preset file
 4333                    else:
 4334                        log.error(
 4335                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4336                        )
 4337                        raise ValueError(
 4338                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4339                        )
 4340
 4341                # If no analysis dict created
 4342                if not param_exomiser_analysis_dict:
 4343                    log.error(f"No analysis config")
 4344                    raise ValueError(f"No analysis config")
 4345
 4346                # Log
 4347                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4348
 4349                ### PHENOPACKET ###
 4350                ###################
 4351
 4352                # If no PhenoPacket in analysis dict -> check in param
 4353                if "phenopacket" not in param_exomiser_analysis_dict:
 4354
 4355                    # If PhenoPacket in param -> load anlaysis json
 4356                    if param_exomiser.get("phenopacket", None):
 4357
 4358                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4359                        param_exomiser_phenopacket = full_path(
 4360                            param_exomiser_phenopacket
 4361                        )
 4362
 4363                        # If param phenopacket is a file and exists
 4364                        if isinstance(
 4365                            param_exomiser_phenopacket, str
 4366                        ) and os.path.exists(param_exomiser_phenopacket):
 4367                            # Load phenopacket file into analysis dict (either yaml or json)
 4368                            with open(param_exomiser_phenopacket) as json_file:
 4369                                param_exomiser_analysis_dict["phenopacket"] = (
 4370                                    yaml.safe_load(json_file)
 4371                                )
 4372
 4373                        # If param phenopacket is a dict
 4374                        elif isinstance(param_exomiser_phenopacket, dict):
 4375                            # Load phenopacket dict into analysis dict (either yaml or json)
 4376                            param_exomiser_analysis_dict["phenopacket"] = (
 4377                                param_exomiser_phenopacket
 4378                            )
 4379
 4380                        # Error phenopacket type
 4381                        else:
 4382                            log.error(f"Phenopacket type unknown. Check param file.")
 4383                            raise ValueError(
 4384                                f"Phenopacket type unknown. Check param file."
 4385                            )
 4386
 4387                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4388                if "phenopacket" not in param_exomiser_analysis_dict:
 4389
 4390                    # Init PhenoPacket
 4391                    param_exomiser_analysis_dict["phenopacket"] = {
 4392                        "id": "analysis",
 4393                        "proband": {},
 4394                    }
 4395
 4396                    ### Add subject ###
 4397
 4398                    # If subject exists
 4399                    param_exomiser_subject = param_exomiser.get("subject", {})
 4400
 4401                    # If subject not exists -> found sample ID
 4402                    if not param_exomiser_subject:
 4403
 4404                        # Found sample ID in param
 4405                        sample = param_exomiser.get("sample", None)
 4406
 4407                        # Find sample ID (first sample)
 4408                        if not sample:
 4409                            sample_list = self.get_header_sample_list()
 4410                            if len(sample_list) > 0:
 4411                                sample = sample_list[0]
 4412                            else:
 4413                                log.error(f"No sample found")
 4414                                raise ValueError(f"No sample found")
 4415
 4416                        # Create subject
 4417                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4418
 4419                    # Add to dict
 4420                    param_exomiser_analysis_dict["phenopacket"][
 4421                        "subject"
 4422                    ] = param_exomiser_subject
 4423
 4424                    ### Add "phenotypicFeatures" ###
 4425
 4426                    # If phenotypicFeatures exists
 4427                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4428                        "phenotypicFeatures", []
 4429                    )
 4430
 4431                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4432                    if not param_exomiser_phenotypicfeatures:
 4433
 4434                        # Found HPO in param
 4435                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4436
 4437                        # Split HPO if list in string format separated by comma
 4438                        if isinstance(param_exomiser_hpo, str):
 4439                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4440
 4441                        # Create HPO list
 4442                        for hpo in param_exomiser_hpo:
 4443                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4444                            param_exomiser_phenotypicfeatures.append(
 4445                                {
 4446                                    "type": {
 4447                                        "id": f"HP:{hpo_clean}",
 4448                                        "label": f"HP:{hpo_clean}",
 4449                                    }
 4450                                }
 4451                            )
 4452
 4453                    # Add to dict
 4454                    param_exomiser_analysis_dict["phenopacket"][
 4455                        "phenotypicFeatures"
 4456                    ] = param_exomiser_phenotypicfeatures
 4457
 4458                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4459                    if not param_exomiser_phenotypicfeatures:
 4460                        for step in param_exomiser_analysis_dict.get(
 4461                            "analysis", {}
 4462                        ).get("steps", []):
 4463                            if "hiPhivePrioritiser" in step:
 4464                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4465                                    "steps", []
 4466                                ).remove(step)
 4467
 4468                ### Add Input File ###
 4469
 4470                # Initial file name and htsFiles
 4471                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4472                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4473                    {
 4474                        "uri": tmp_vcf_name,
 4475                        "htsFormat": "VCF",
 4476                        "genomeAssembly": assembly,
 4477                    }
 4478                ]
 4479
 4480                ### Add metaData ###
 4481
 4482                # If metaData not in analysis dict
 4483                if "metaData" not in param_exomiser_analysis_dict:
 4484                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4485                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4486                        "createdBy": "howard",
 4487                        "phenopacketSchemaVersion": 1,
 4488                    }
 4489
 4490                ### OutputOptions ###
 4491
 4492                # Init output result folder
 4493                output_results = os.path.join(tmp_dir, "results")
 4494
 4495                # If no outputOptions in analysis dict
 4496                if "outputOptions" not in param_exomiser_analysis_dict:
 4497
 4498                    # default output formats
 4499                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4500
 4501                    # Get outputOptions in param
 4502                    output_options = param_exomiser.get("outputOptions", None)
 4503
 4504                    # If no output_options in param -> check
 4505                    if not output_options:
 4506                        output_options = {
 4507                            "outputContributingVariantsOnly": False,
 4508                            "numGenes": 0,
 4509                            "outputFormats": defaut_output_formats,
 4510                        }
 4511
 4512                    # Replace outputDirectory in output options
 4513                    output_options["outputDirectory"] = output_results
 4514                    output_options["outputFileName"] = "howard"
 4515
 4516                    # Add outputOptions in analysis dict
 4517                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4518
 4519                else:
 4520
 4521                    # Replace output_results and output format (if exists in param)
 4522                    param_exomiser_analysis_dict["outputOptions"][
 4523                        "outputDirectory"
 4524                    ] = output_results
 4525                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4526                        list(
 4527                            set(
 4528                                param_exomiser_analysis_dict.get(
 4529                                    "outputOptions", {}
 4530                                ).get("outputFormats", [])
 4531                                + ["TSV_VARIANT", "VCF"]
 4532                            )
 4533                        )
 4534                    )
 4535
 4536                # log
 4537                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4538
 4539                ### ANALYSIS FILE ###
 4540                #####################
 4541
 4542                ### Full JSON analysis config file ###
 4543
 4544                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4545                with open(exomiser_analysis, "w") as fp:
 4546                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4547
 4548                ### SPLIT analysis and sample config files
 4549
 4550                # Splitted analysis dict
 4551                param_exomiser_analysis_dict_for_split = (
 4552                    param_exomiser_analysis_dict.copy()
 4553                )
 4554
 4555                # Phenopacket JSON file
 4556                exomiser_analysis_phenopacket = os.path.join(
 4557                    tmp_dir, "analysis_phenopacket.json"
 4558                )
 4559                with open(exomiser_analysis_phenopacket, "w") as fp:
 4560                    json.dump(
 4561                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4562                        fp,
 4563                        indent=4,
 4564                    )
 4565
 4566                # Analysis JSON file without Phenopacket parameters
 4567                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4568                exomiser_analysis_analysis = os.path.join(
 4569                    tmp_dir, "analysis_analysis.json"
 4570                )
 4571                with open(exomiser_analysis_analysis, "w") as fp:
 4572                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4573
 4574                ### INITAL VCF file ###
 4575                #######################
 4576
 4577                ### Create list of samples to use and include inti initial VCF file ####
 4578
 4579                # Subject (main sample)
 4580                # Get sample ID in analysis dict
 4581                sample_subject = (
 4582                    param_exomiser_analysis_dict.get("phenopacket", {})
 4583                    .get("subject", {})
 4584                    .get("id", None)
 4585                )
 4586                sample_proband = (
 4587                    param_exomiser_analysis_dict.get("phenopacket", {})
 4588                    .get("proband", {})
 4589                    .get("subject", {})
 4590                    .get("id", None)
 4591                )
 4592                sample = []
 4593                if sample_subject:
 4594                    sample.append(sample_subject)
 4595                if sample_proband:
 4596                    sample.append(sample_proband)
 4597
 4598                # Get sample ID within Pedigree
 4599                pedigree_persons_list = (
 4600                    param_exomiser_analysis_dict.get("phenopacket", {})
 4601                    .get("pedigree", {})
 4602                    .get("persons", {})
 4603                )
 4604
 4605                # Create list with all sample ID in pedigree (if exists)
 4606                pedigree_persons = []
 4607                for person in pedigree_persons_list:
 4608                    pedigree_persons.append(person.get("individualId"))
 4609
 4610                # Concat subject sample ID and samples ID in pedigreesamples
 4611                samples = list(set(sample + pedigree_persons))
 4612
 4613                # Check if sample list is not empty
 4614                if not samples:
 4615                    log.error(f"No samples found")
 4616                    raise ValueError(f"No samples found")
 4617
 4618                # Create VCF with sample (either sample in param or first one by default)
 4619                # Export VCF file
 4620                self.export_variant_vcf(
 4621                    vcf_file=tmp_vcf_name,
 4622                    remove_info=True,
 4623                    add_samples=True,
 4624                    list_samples=samples,
 4625                    index=False,
 4626                )
 4627
 4628                ### Execute Exomiser ###
 4629                ########################
 4630
 4631                # Init command
 4632                exomiser_command = ""
 4633
 4634                # Command exomiser options
 4635                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4636
 4637                # Release
 4638                exomiser_release = param_exomiser.get("release", None)
 4639                if exomiser_release:
 4640                    # phenotype data version
 4641                    exomiser_options += (
 4642                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4643                    )
 4644                    # data version
 4645                    exomiser_options += (
 4646                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4647                    )
 4648                    # variant white list
 4649                    variant_white_list_file = (
 4650                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4651                    )
 4652                    if os.path.exists(
 4653                        os.path.join(
 4654                            databases_folders, assembly, variant_white_list_file
 4655                        )
 4656                    ):
 4657                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4658
 4659                # transcript_source
 4660                transcript_source = param_exomiser.get(
 4661                    "transcript_source", None
 4662                )  # ucsc, refseq, ensembl
 4663                if transcript_source:
 4664                    exomiser_options += (
 4665                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4666                    )
 4667
 4668                # If analysis contain proband param
 4669                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4670                    "proband", {}
 4671                ):
 4672                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4673
 4674                # If no proband (usually uniq sample)
 4675                else:
 4676                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4677
 4678                # Log
 4679                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4680
 4681                # Run command
 4682                result = subprocess.call(
 4683                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4684                )
 4685                if result:
 4686                    log.error("Exomiser command failed")
 4687                    raise ValueError("Exomiser command failed")
 4688
 4689                ### RESULTS ###
 4690                ###############
 4691
 4692                ### Annotate with TSV fields ###
 4693
 4694                # Init result tsv file
 4695                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4696
 4697                # Init result tsv file
 4698                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4699
 4700                # Parse TSV file and explode columns in INFO field
 4701                if exomiser_to_info and os.path.exists(output_results_tsv):
 4702
 4703                    # Log
 4704                    log.debug("Exomiser columns to VCF INFO field")
 4705
 4706                    # Retrieve columns and types
 4707                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4708                    output_results_tsv_df = self.get_query_to_df(query)
 4709                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4710
 4711                    # Init concat fields for update
 4712                    sql_query_update_concat_fields = []
 4713
 4714                    # Fields to avoid
 4715                    fields_to_avoid = [
 4716                        "CONTIG",
 4717                        "START",
 4718                        "END",
 4719                        "REF",
 4720                        "ALT",
 4721                        "QUAL",
 4722                        "FILTER",
 4723                        "GENOTYPE",
 4724                    ]
 4725
 4726                    # List all columns to add into header
 4727                    for header_column in output_results_tsv_columns:
 4728
 4729                        # If header column is enable
 4730                        if header_column not in fields_to_avoid:
 4731
 4732                            # Header info type
 4733                            header_info_type = "String"
 4734                            header_column_df = output_results_tsv_df[header_column]
 4735                            header_column_df_dtype = header_column_df.dtype
 4736                            if header_column_df_dtype == object:
 4737                                if (
 4738                                    pd.to_numeric(header_column_df, errors="coerce")
 4739                                    .notnull()
 4740                                    .all()
 4741                                ):
 4742                                    header_info_type = "Float"
 4743                            else:
 4744                                header_info_type = "Integer"
 4745
 4746                            # Header info
 4747                            characters_to_validate = ["-"]
 4748                            pattern = "[" + "".join(characters_to_validate) + "]"
 4749                            header_info_name = re.sub(
 4750                                pattern,
 4751                                "_",
 4752                                f"Exomiser_{header_column}".replace("#", ""),
 4753                            )
 4754                            header_info_number = "."
 4755                            header_info_description = (
 4756                                f"Exomiser {header_column} annotation"
 4757                            )
 4758                            header_info_source = "Exomiser"
 4759                            header_info_version = "unknown"
 4760                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4761                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4762                                header_info_name,
 4763                                header_info_number,
 4764                                header_info_type,
 4765                                header_info_description,
 4766                                header_info_source,
 4767                                header_info_version,
 4768                                header_info_code,
 4769                            )
 4770
 4771                            # Add field to add for update to concat fields
 4772                            sql_query_update_concat_fields.append(
 4773                                f"""
 4774                                CASE
 4775                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4776                                    THEN concat(
 4777                                        '{header_info_name}=',
 4778                                        table_parquet."{header_column}",
 4779                                        ';'
 4780                                        )
 4781
 4782                                    ELSE ''
 4783                                END
 4784                            """
 4785                            )
 4786
 4787                    # Update query
 4788                    sql_query_update = f"""
 4789                        UPDATE {table_variants} as table_variants
 4790                            SET INFO = concat(
 4791                                            CASE
 4792                                                WHEN INFO NOT IN ('', '.')
 4793                                                THEN INFO
 4794                                                ELSE ''
 4795                                            END,
 4796                                            CASE
 4797                                                WHEN table_variants.INFO NOT IN ('','.')
 4798                                                THEN ';'
 4799                                                ELSE ''
 4800                                            END,
 4801                                            (
 4802                                            SELECT 
 4803                                                concat(
 4804                                                    {",".join(sql_query_update_concat_fields)}
 4805                                                )
 4806                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4807                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4808                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4809                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4810                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4811                                            )
 4812                                        )
 4813                            ;
 4814                        """
 4815
 4816                    # Update
 4817                    self.conn.execute(sql_query_update)
 4818
 4819                ### Annotate with VCF INFO field ###
 4820
 4821                # Init result VCF file
 4822                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4823
 4824                # If VCF exists
 4825                if os.path.exists(output_results_vcf):
 4826
 4827                    # Log
 4828                    log.debug("Exomiser result VCF update variants")
 4829
 4830                    # Find Exomiser INFO field annotation in header
 4831                    with gzip.open(output_results_vcf, "rt") as f:
 4832                        header_list = self.read_vcf_header(f)
 4833                    exomiser_vcf_header = vcf.Reader(
 4834                        io.StringIO("\n".join(header_list))
 4835                    )
 4836
 4837                    # Add annotation INFO field to header
 4838                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4839
 4840                    # Update variants with VCF
 4841                    self.update_from_vcf(output_results_vcf)
 4842
 4843        return True
 4844
 4845    def annotation_snpeff(self, threads: int = None) -> None:
 4846        """
 4847        This function annotate with snpEff
 4848
 4849        :param threads: The number of threads to use
 4850        :return: the value of the variable "return_value".
 4851        """
 4852
 4853        # DEBUG
 4854        log.debug("Start annotation with snpeff databases")
 4855
 4856        # Threads
 4857        if not threads:
 4858            threads = self.get_threads()
 4859        log.debug("Threads: " + str(threads))
 4860
 4861        # DEBUG
 4862        delete_tmp = True
 4863        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4864            delete_tmp = False
 4865            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4866
 4867        # Config
 4868        config = self.get_config()
 4869        log.debug("Config: " + str(config))
 4870
 4871        # Config - Folders - Databases
 4872        databases_folders = (
 4873            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4874        )
 4875        log.debug("Databases annotations: " + str(databases_folders))
 4876
 4877        # # Config - Java
 4878        # java_bin = get_bin(
 4879        #     tool="java",
 4880        #     bin="java",
 4881        #     bin_type="bin",
 4882        #     config=config,
 4883        #     default_folder="/usr/bin",
 4884        # )
 4885        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4886        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4887        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4888
 4889        # # Config - snpEff bin
 4890        # snpeff_jar = get_bin(
 4891        #     tool="snpeff",
 4892        #     bin="snpEff.jar",
 4893        #     bin_type="jar",
 4894        #     config=config,
 4895        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4896        # )
 4897        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4898        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4899        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4900
 4901        # Config - snpEff bin command
 4902        snpeff_bin_command = get_bin_command(
 4903            bin="snpEff.jar",
 4904            tool="snpeff",
 4905            bin_type="jar",
 4906            config=config,
 4907            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4908        )
 4909        if not snpeff_bin_command:
 4910            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4911            log.error(msg_err)
 4912            raise ValueError(msg_err)
 4913
 4914        # Config - snpEff databases
 4915        snpeff_databases = (
 4916            config.get("folders", {})
 4917            .get("databases", {})
 4918            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4919        )
 4920        snpeff_databases = full_path(snpeff_databases)
 4921        if snpeff_databases is not None and snpeff_databases != "":
 4922            log.debug(f"Create snpEff databases folder")
 4923            if not os.path.exists(snpeff_databases):
 4924                os.makedirs(snpeff_databases)
 4925
 4926        # Param
 4927        param = self.get_param()
 4928        log.debug("Param: " + str(param))
 4929
 4930        # Param
 4931        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4932        log.debug("Options: " + str(options))
 4933
 4934        # Param - Assembly
 4935        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4936
 4937        # Param - Options
 4938        snpeff_options = (
 4939            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4940        )
 4941        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4942        snpeff_csvstats = (
 4943            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4944        )
 4945        if snpeff_stats:
 4946            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4947            snpeff_stats = full_path(snpeff_stats)
 4948            snpeff_options += f" -stats {snpeff_stats}"
 4949        if snpeff_csvstats:
 4950            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4951            snpeff_csvstats = full_path(snpeff_csvstats)
 4952            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4953
 4954        # Data
 4955        table_variants = self.get_table_variants()
 4956
 4957        # Check if not empty
 4958        log.debug("Check if not empty")
 4959        sql_query_chromosomes = (
 4960            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4961        )
 4962        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4963        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4964            log.info(f"VCF empty")
 4965            return
 4966
 4967        # Export in VCF
 4968        log.debug("Create initial file to annotate")
 4969        tmp_vcf = NamedTemporaryFile(
 4970            prefix=self.get_prefix(),
 4971            dir=self.get_tmp_dir(),
 4972            suffix=".vcf.gz",
 4973            delete=True,
 4974        )
 4975        tmp_vcf_name = tmp_vcf.name
 4976
 4977        # VCF header
 4978        vcf_reader = self.get_header()
 4979        log.debug("Initial header: " + str(vcf_reader.infos))
 4980
 4981        # Existing annotations
 4982        for vcf_annotation in self.get_header().infos:
 4983
 4984            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4985            log.debug(
 4986                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4987            )
 4988
 4989        # Memory limit
 4990        # if config.get("memory", None):
 4991        #     memory_limit = config.get("memory", "8G")
 4992        # else:
 4993        #     memory_limit = "8G"
 4994        memory_limit = self.get_memory("8G")
 4995        log.debug(f"memory_limit: {memory_limit}")
 4996
 4997        # snpEff java options
 4998        snpeff_java_options = (
 4999            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5000        )
 5001        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5002
 5003        force_update_annotation = True
 5004
 5005        if "ANN" not in self.get_header().infos or force_update_annotation:
 5006
 5007            # Check snpEff database
 5008            log.debug(f"Check snpEff databases {[assembly]}")
 5009            databases_download_snpeff(
 5010                folder=snpeff_databases, assemblies=[assembly], config=config
 5011            )
 5012
 5013            # Export VCF file
 5014            self.export_variant_vcf(
 5015                vcf_file=tmp_vcf_name,
 5016                remove_info=True,
 5017                add_samples=False,
 5018                index=True,
 5019            )
 5020
 5021            # Tmp file
 5022            err_files = []
 5023            tmp_annotate_vcf = NamedTemporaryFile(
 5024                prefix=self.get_prefix(),
 5025                dir=self.get_tmp_dir(),
 5026                suffix=".vcf",
 5027                delete=False,
 5028            )
 5029            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5030            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5031            err_files.append(tmp_annotate_vcf_name_err)
 5032
 5033            # Command
 5034            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5035            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5036            run_parallel_commands([snpeff_command], 1)
 5037
 5038            # Error messages
 5039            log.info(f"Error/Warning messages:")
 5040            error_message_command_all = []
 5041            error_message_command_warning = []
 5042            error_message_command_err = []
 5043            for err_file in err_files:
 5044                with open(err_file, "r") as f:
 5045                    for line in f:
 5046                        message = line.strip()
 5047                        error_message_command_all.append(message)
 5048                        if line.startswith("[W::"):
 5049                            error_message_command_warning.append(message)
 5050                        if line.startswith("[E::"):
 5051                            error_message_command_err.append(f"{err_file}: " + message)
 5052            # log info
 5053            for message in list(
 5054                set(error_message_command_err + error_message_command_warning)
 5055            ):
 5056                log.info(f"   {message}")
 5057            # debug info
 5058            for message in list(set(error_message_command_all)):
 5059                log.debug(f"   {message}")
 5060            # failed
 5061            if len(error_message_command_err):
 5062                log.error("Annotation failed: Error in commands")
 5063                raise ValueError("Annotation failed: Error in commands")
 5064
 5065            # Find annotation in header
 5066            with open(tmp_annotate_vcf_name, "rt") as f:
 5067                header_list = self.read_vcf_header(f)
 5068            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5069
 5070            for ann in annovar_vcf_header.infos:
 5071                if ann not in self.get_header().infos:
 5072                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5073
 5074            # Update variants
 5075            log.info(f"Annotation - Updating...")
 5076            self.update_from_vcf(tmp_annotate_vcf_name)
 5077
 5078        else:
 5079            if "ANN" in self.get_header().infos:
 5080                log.debug(f"Existing snpEff annotations in VCF")
 5081            if force_update_annotation:
 5082                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5083
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate loaded variants with Annovar databases.

        For each configured Annovar database, the variants table is exported
        to a temporary VCF, annotated with table_annovar.pl, cleaned and
        renamed through a bcftools/sed/awk pipeline, and all annotated files
        are merged back into the variants table and the in-memory VCF header.

        :param threads: number of threads to use (defaults to self.get_threads())
        :raises ValueError: if the annovar/bcftools commands are unavailable,
            or if an annotation command wrote error messages to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, collected for the final cleanup step
        tmp_files = []
        err_files = []

        # Keep temporary files when verbosity is debug, to ease troubleshooting
        # NOTE(review): delete_tmp is logged but the cleanup below always runs
        # ('if True:') — confirm whether debug mode should skip cleaning
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for view/annotate/merge steps)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options, e.g. 'genebase')
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of database name -> {field: new_name}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is currently always (re-)applied, even for existing fields
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF: single input file shared by all Annovar runs
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO set to '.', no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots input,
            # one 'INFO/old new' line per annotation field)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No explicit fields: keep the whole INFO content
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp files for this Annovar run
                # NOTE(review): err_files is reset here, so only the current
                # database's stderr (plus the later merge stderr) is tracked
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Collect fields to annotate and their renamed counterparts
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ('INFO/old new' appended to the rename file)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol: the Annovar database name
                protocol = annotation

                # argument: optional extra table_annovar.pl argument
                argument = ""

                # operation: Annovar operation type — 'g' gene-based,
                # 'r' region-based, 'f' filter-based (default)
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options: pass through all remaining user options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar: run table_annovar.pl, then move its
                # '<assembly>_multianno.vcf' output into place for the pipe
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): rebuild the
                # INFO column keeping only key=value pairs whose value is not '.'
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for warnings/errors from
                # htslib ('[W::'/'[E::') and Annovar ('WARNING'/'ERROR')
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine the input VCF with all annotated VCFs
                # NOTE(review): the merge stderr file is appended to err_files
                # but never re-scanned, so merge errors are not detected here
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged (bgzipped) VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO definitions into the in-memory header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table with the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
 5464
 5465    # Parquet
 5466    def annotation_parquet(self, threads: int = None) -> None:
 5467        """
 5468        It takes a VCF file, and annotates it with a parquet file
 5469
 5470        :param threads: number of threads to use for the annotation
 5471        :return: the value of the variable "result".
 5472        """
 5473
 5474        # DEBUG
 5475        log.debug("Start annotation with parquet databases")
 5476
 5477        # Threads
 5478        if not threads:
 5479            threads = self.get_threads()
 5480        log.debug("Threads: " + str(threads))
 5481
 5482        # DEBUG
 5483        delete_tmp = True
 5484        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5485            delete_tmp = False
 5486            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5487
 5488        # Config
 5489        databases_folders = set(
 5490            self.get_config()
 5491            .get("folders", {})
 5492            .get("databases", {})
 5493            .get("annotations", ["."])
 5494            + self.get_config()
 5495            .get("folders", {})
 5496            .get("databases", {})
 5497            .get("parquet", ["."])
 5498        )
 5499        log.debug("Databases annotations: " + str(databases_folders))
 5500
 5501        # Param
 5502        annotations = (
 5503            self.get_param()
 5504            .get("annotation", {})
 5505            .get("parquet", {})
 5506            .get("annotations", None)
 5507        )
 5508        log.debug("Annotations: " + str(annotations))
 5509
 5510        # Assembly
 5511        assembly = self.get_param().get(
 5512            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5513        )
 5514
 5515        # Force Update Annotation
 5516        force_update_annotation = (
 5517            self.get_param()
 5518            .get("annotation", {})
 5519            .get("options", {})
 5520            .get("annotations_update", False)
 5521        )
 5522        log.debug(f"force_update_annotation={force_update_annotation}")
 5523        force_append_annotation = (
 5524            self.get_param()
 5525            .get("annotation", {})
 5526            .get("options", {})
 5527            .get("annotations_append", False)
 5528        )
 5529        log.debug(f"force_append_annotation={force_append_annotation}")
 5530
 5531        # Data
 5532        table_variants = self.get_table_variants()
 5533
 5534        # Check if not empty
 5535        log.debug("Check if not empty")
 5536        sql_query_chromosomes_df = self.get_query_to_df(
 5537            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5538        )
 5539        if not sql_query_chromosomes_df["count"][0]:
 5540            log.info(f"VCF empty")
 5541            return
 5542
 5543        # VCF header
 5544        vcf_reader = self.get_header()
 5545        log.debug("Initial header: " + str(vcf_reader.infos))
 5546
 5547        # Nb Variants POS
 5548        log.debug("NB Variants Start")
 5549        nb_variants = self.conn.execute(
 5550            f"SELECT count(*) AS count FROM variants"
 5551        ).fetchdf()["count"][0]
 5552        log.debug("NB Variants Stop")
 5553
 5554        # Existing annotations
 5555        for vcf_annotation in self.get_header().infos:
 5556
 5557            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5558            log.debug(
 5559                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5560            )
 5561
 5562        # Added columns
 5563        added_columns = []
 5564
 5565        # drop indexes
 5566        log.debug(f"Drop indexes...")
 5567        self.drop_indexes()
 5568
 5569        if annotations:
 5570
 5571            if "ALL" in annotations:
 5572
 5573                all_param = annotations.get("ALL", {})
 5574                all_param_formats = all_param.get("formats", None)
 5575                all_param_releases = all_param.get("releases", None)
 5576
 5577                databases_infos_dict = self.scan_databases(
 5578                    database_formats=all_param_formats,
 5579                    database_releases=all_param_releases,
 5580                )
 5581                for database_infos in databases_infos_dict.keys():
 5582                    if database_infos not in annotations:
 5583                        annotations[database_infos] = {"INFO": None}
 5584
 5585            for annotation in annotations:
 5586
 5587                if annotation in ["ALL"]:
 5588                    continue
 5589
 5590                # Annotation Name
 5591                annotation_name = os.path.basename(annotation)
 5592
 5593                # Annotation fields
 5594                annotation_fields = annotations[annotation]
 5595                if not annotation_fields:
 5596                    annotation_fields = {"INFO": None}
 5597
 5598                log.debug(f"Annotation '{annotation_name}'")
 5599                log.debug(
 5600                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5601                )
 5602
 5603                # Create Database
 5604                database = Database(
 5605                    database=annotation,
 5606                    databases_folders=databases_folders,
 5607                    assembly=assembly,
 5608                )
 5609
 5610                # Find files
 5611                parquet_file = database.get_database()
 5612                parquet_hdr_file = database.get_header_file()
 5613                parquet_type = database.get_type()
 5614
 5615                # Check if files exists
 5616                if not parquet_file or not parquet_hdr_file:
 5617                    log.error("Annotation failed: file not found")
 5618                    raise ValueError("Annotation failed: file not found")
 5619                else:
 5620                    # Get parquet connexion
 5621                    parquet_sql_attach = database.get_sql_database_attach(
 5622                        output="query"
 5623                    )
 5624                    if parquet_sql_attach:
 5625                        self.conn.execute(parquet_sql_attach)
 5626                    parquet_file_link = database.get_sql_database_link()
 5627                    # Log
 5628                    log.debug(
 5629                        f"Annotation '{annotation_name}' - file: "
 5630                        + str(parquet_file)
 5631                        + " and "
 5632                        + str(parquet_hdr_file)
 5633                    )
 5634
 5635                    # Database full header columns
 5636                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5637                        parquet_hdr_file
 5638                    )
 5639                    # Log
 5640                    log.debug(
 5641                        "Annotation database header columns : "
 5642                        + str(parquet_hdr_vcf_header_columns)
 5643                    )
 5644
 5645                    # Load header as VCF object
 5646                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5647                    # Log
 5648                    log.debug(
 5649                        "Annotation database header: "
 5650                        + str(parquet_hdr_vcf_header_infos)
 5651                    )
 5652
 5653                    # Get extra infos
 5654                    parquet_columns = database.get_extra_columns()
 5655                    # Log
 5656                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5657
 5658                    # Add extra columns if "ALL" in annotation_fields
 5659                    # if "ALL" in annotation_fields:
 5660                    #     allow_add_extra_column = True
 5661                    if "ALL" in annotation_fields and database.get_extra_columns():
 5662                        for extra_column in database.get_extra_columns():
 5663                            if (
 5664                                extra_column not in annotation_fields
 5665                                and extra_column.replace("INFO/", "")
 5666                                not in parquet_hdr_vcf_header_infos
 5667                            ):
 5668                                parquet_hdr_vcf_header_infos[extra_column] = (
 5669                                    vcf.parser._Info(
 5670                                        extra_column,
 5671                                        ".",
 5672                                        "String",
 5673                                        f"{extra_column} description",
 5674                                        "unknown",
 5675                                        "unknown",
 5676                                        self.code_type_map["String"],
 5677                                    )
 5678                                )
 5679
 5680                    # For all fields in database
 5681                    annotation_fields_all = False
 5682                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5683                        annotation_fields_all = True
 5684                        annotation_fields = {
 5685                            key: key for key in parquet_hdr_vcf_header_infos
 5686                        }
 5687
 5688                        log.debug(
 5689                            "Annotation database header - All annotations added: "
 5690                            + str(annotation_fields)
 5691                        )
 5692
 5693                    # Init
 5694
 5695                    # List of annotation fields to use
 5696                    sql_query_annotation_update_info_sets = []
 5697
 5698                    # List of annotation to agregate
 5699                    sql_query_annotation_to_agregate = []
 5700
 5701                    # Number of fields
 5702                    nb_annotation_field = 0
 5703
 5704                    # Annotation fields processed
 5705                    annotation_fields_processed = []
 5706
 5707                    # Columns mapping
 5708                    map_columns = database.map_columns(
 5709                        columns=annotation_fields, prefixes=["INFO/"]
 5710                    )
 5711
 5712                    # Query dict for fields to remove (update option)
 5713                    query_dict_remove = {}
 5714
 5715                    # Fetch Anotation fields
 5716                    for annotation_field in annotation_fields:
 5717
 5718                        # annotation_field_column
 5719                        annotation_field_column = map_columns.get(
 5720                            annotation_field, "INFO"
 5721                        )
 5722
 5723                        # field new name, if parametered
 5724                        annotation_fields_new_name = annotation_fields.get(
 5725                            annotation_field, annotation_field
 5726                        )
 5727                        if not annotation_fields_new_name:
 5728                            annotation_fields_new_name = annotation_field
 5729
 5730                        # To annotate
 5731                        # force_update_annotation = True
 5732                        # force_append_annotation = True
 5733                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5734                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5735                            force_update_annotation
 5736                            or force_append_annotation
 5737                            or (
 5738                                annotation_fields_new_name
 5739                                not in self.get_header().infos
 5740                            )
 5741                        ):
 5742
 5743                            # Add field to annotation to process list
 5744                            annotation_fields_processed.append(
 5745                                annotation_fields_new_name
 5746                            )
 5747
 5748                            # explode infos for the field
 5749                            annotation_fields_new_name_info_msg = ""
 5750                            if (
 5751                                force_update_annotation
 5752                                and annotation_fields_new_name
 5753                                in self.get_header().infos
 5754                            ):
 5755                                # Remove field from INFO
 5756                                query = f"""
 5757                                    UPDATE {table_variants} as table_variants
 5758                                    SET INFO = REGEXP_REPLACE(
 5759                                                concat(table_variants.INFO,''),
 5760                                                ';*{annotation_fields_new_name}=[^;]*',
 5761                                                ''
 5762                                                )
 5763                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5764                                """
 5765                                annotation_fields_new_name_info_msg = " [update]"
 5766                                query_dict_remove[
 5767                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5768                                ] = query
 5769
 5770                            # Sep between fields in INFO
 5771                            nb_annotation_field += 1
 5772                            if nb_annotation_field > 1:
 5773                                annotation_field_sep = ";"
 5774                            else:
 5775                                annotation_field_sep = ""
 5776
 5777                            log.info(
 5778                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5779                            )
 5780
 5781                            # Add INFO field to header
 5782                            parquet_hdr_vcf_header_infos_number = (
 5783                                parquet_hdr_vcf_header_infos[annotation_field].num
 5784                                or "."
 5785                            )
 5786                            parquet_hdr_vcf_header_infos_type = (
 5787                                parquet_hdr_vcf_header_infos[annotation_field].type
 5788                                or "String"
 5789                            )
 5790                            parquet_hdr_vcf_header_infos_description = (
 5791                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5792                                or f"{annotation_field} description"
 5793                            )
 5794                            parquet_hdr_vcf_header_infos_source = (
 5795                                parquet_hdr_vcf_header_infos[annotation_field].source
 5796                                or "unknown"
 5797                            )
 5798                            parquet_hdr_vcf_header_infos_version = (
 5799                                parquet_hdr_vcf_header_infos[annotation_field].version
 5800                                or "unknown"
 5801                            )
 5802
 5803                            vcf_reader.infos[annotation_fields_new_name] = (
 5804                                vcf.parser._Info(
 5805                                    annotation_fields_new_name,
 5806                                    parquet_hdr_vcf_header_infos_number,
 5807                                    parquet_hdr_vcf_header_infos_type,
 5808                                    parquet_hdr_vcf_header_infos_description,
 5809                                    parquet_hdr_vcf_header_infos_source,
 5810                                    parquet_hdr_vcf_header_infos_version,
 5811                                    self.code_type_map[
 5812                                        parquet_hdr_vcf_header_infos_type
 5813                                    ],
 5814                                )
 5815                            )
 5816
 5817                            # Append
 5818                            if force_append_annotation:
 5819                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5820                            else:
 5821                                query_case_when_append = ""
 5822
 5823                            # Annotation/Update query fields
 5824                            # Found in INFO column
 5825                            if (
 5826                                annotation_field_column == "INFO"
 5827                                and "INFO" in parquet_hdr_vcf_header_columns
 5828                            ):
 5829                                sql_query_annotation_update_info_sets.append(
 5830                                    f"""
 5831                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5832                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5833                                        ELSE ''
 5834                                    END
 5835                                """
 5836                                )
 5837                            # Found in a specific column
 5838                            else:
 5839                                sql_query_annotation_update_info_sets.append(
 5840                                    f"""
 5841                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 5842                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 5843                                        ELSE ''
 5844                                    END
 5845                                """
 5846                                )
 5847                                sql_query_annotation_to_agregate.append(
 5848                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5849                                )
 5850
 5851                        # Not to annotate
 5852                        else:
 5853
 5854                            if force_update_annotation:
 5855                                annotation_message = "forced"
 5856                            else:
 5857                                annotation_message = "skipped"
 5858
 5859                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5860                                log.warning(
 5861                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5862                                )
 5863                            if annotation_fields_new_name in self.get_header().infos:
 5864                                log.warning(
 5865                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5866                                )
 5867
 5868                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5869                    # allow_annotation_full_info = True
 5870                    allow_annotation_full_info = not force_append_annotation
 5871
 5872                    if parquet_type in ["regions"]:
 5873                        allow_annotation_full_info = False
 5874
 5875                    if (
 5876                        allow_annotation_full_info
 5877                        and nb_annotation_field == len(annotation_fields)
 5878                        and annotation_fields_all
 5879                        and (
 5880                            "INFO" in parquet_hdr_vcf_header_columns
 5881                            and "INFO" in database.get_extra_columns()
 5882                        )
 5883                    ):
 5884                        log.debug("Column INFO annotation enabled")
 5885                        sql_query_annotation_update_info_sets = []
 5886                        sql_query_annotation_update_info_sets.append(
 5887                            f" table_parquet.INFO "
 5888                        )
 5889
 5890                    if sql_query_annotation_update_info_sets:
 5891
 5892                        # Annotate
 5893                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5894
 5895                        # Join query annotation update info sets for SQL
 5896                        sql_query_annotation_update_info_sets_sql = ",".join(
 5897                            sql_query_annotation_update_info_sets
 5898                        )
 5899
 5900                        # Check chromosomes list (and variants infos)
 5901                        sql_query_chromosomes = f"""
 5902                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5903                            FROM {table_variants} as table_variants
 5904                            GROUP BY table_variants."#CHROM"
 5905                            ORDER BY table_variants."#CHROM"
 5906                            """
 5907                        sql_query_chromosomes_df = self.conn.execute(
 5908                            sql_query_chromosomes
 5909                        ).df()
 5910                        sql_query_chromosomes_dict = {
 5911                            entry["CHROM"]: {
 5912                                "count": entry["count_variants"],
 5913                                "min": entry["min_variants"],
 5914                                "max": entry["max_variants"],
 5915                            }
 5916                            for index, entry in sql_query_chromosomes_df.iterrows()
 5917                        }
 5918
 5919                        # Init
 5920                        nb_of_query = 0
 5921                        nb_of_variant_annotated = 0
 5922                        query_dict = query_dict_remove
 5923
 5924                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5925                        for chrom in sql_query_chromosomes_dict:
 5926
 5927                            # Number of variant by chromosome
 5928                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5929                                chrom, {}
 5930                            ).get("count", 0)
 5931
 5932                            log.debug(
 5933                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5934                            )
 5935
 5936                            # Annotation with regions database
 5937                            if parquet_type in ["regions"]:
 5938                                sql_query_annotation_from_clause = f"""
 5939                                    FROM (
 5940                                        SELECT 
 5941                                            '{chrom}' AS \"#CHROM\",
 5942                                            table_variants_from.\"POS\" AS \"POS\",
 5943                                            {",".join(sql_query_annotation_to_agregate)}
 5944                                        FROM {table_variants} as table_variants_from
 5945                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5946                                            table_parquet_from."#CHROM" = '{chrom}'
 5947                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5948                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5949                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5950                                                )
 5951                                        )
 5952                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5953                                        GROUP BY table_variants_from.\"POS\"
 5954                                        )
 5955                                        as table_parquet
 5956                                """
 5957
 5958                                sql_query_annotation_where_clause = """
 5959                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5960                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5961                                """
 5962
 5963                            # Annotation with variants database
 5964                            else:
 5965                                sql_query_annotation_from_clause = f"""
 5966                                    FROM {parquet_file_link} as table_parquet
 5967                                """
 5968                                sql_query_annotation_where_clause = f"""
 5969                                    table_variants."#CHROM" = '{chrom}'
 5970                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5971                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5972                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5973                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5974                                """
 5975
 5976                            # Create update query
 5977                            sql_query_annotation_chrom_interval_pos = f"""
 5978                                UPDATE {table_variants} as table_variants
 5979                                    SET INFO = 
 5980                                        concat(
 5981                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5982                                                THEN table_variants.INFO
 5983                                                ELSE ''
 5984                                            END
 5985                                            ,
 5986                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5987                                                        AND (
 5988                                                        concat({sql_query_annotation_update_info_sets_sql})
 5989                                                        )
 5990                                                        NOT IN ('','.') 
 5991                                                    THEN ';'
 5992                                                    ELSE ''
 5993                                            END
 5994                                            ,
 5995                                            {sql_query_annotation_update_info_sets_sql}
 5996                                            )
 5997                                    {sql_query_annotation_from_clause}
 5998                                    WHERE {sql_query_annotation_where_clause}
 5999                                    ;
 6000                                """
 6001
 6002                            # Add update query to dict
 6003                            query_dict[
 6004                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6005                            ] = sql_query_annotation_chrom_interval_pos
 6006
 6007                        nb_of_query = len(query_dict)
 6008                        num_query = 0
 6009
 6010                        # SET max_expression_depth TO x
 6011                        self.conn.execute("SET max_expression_depth TO 10000")
 6012
 6013                        for query_name in query_dict:
 6014                            query = query_dict[query_name]
 6015                            num_query += 1
 6016                            log.info(
 6017                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6018                            )
 6019                            result = self.conn.execute(query)
 6020                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6021                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6022                            log.info(
 6023                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6024                            )
 6025
 6026                        log.info(
 6027                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6028                        )
 6029
 6030                    else:
 6031
 6032                        log.info(
 6033                            f"Annotation '{annotation_name}' - No Annotations available"
 6034                        )
 6035
 6036                    log.debug("Final header: " + str(vcf_reader.infos))
 6037
 6038        # Remove added columns
 6039        for added_column in added_columns:
 6040            self.drop_column(column=added_column)
 6041
 6042    def annotation_splice(self, threads: int = None) -> None:
 6043        """
 6044        This function annotate with snpEff
 6045
 6046        :param threads: The number of threads to use
 6047        :return: the value of the variable "return_value".
 6048        """
 6049
 6050        # DEBUG
 6051        log.debug("Start annotation with splice tools")
 6052
 6053        # Threads
 6054        if not threads:
 6055            threads = self.get_threads()
 6056        log.debug("Threads: " + str(threads))
 6057
 6058        # DEBUG
 6059        delete_tmp = True
 6060        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6061            delete_tmp = False
 6062            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6063
 6064        # Config
 6065        config = self.get_config()
 6066        log.debug("Config: " + str(config))
 6067        splice_config = config.get("tools", {}).get("splice", {})
 6068        if not splice_config:
 6069            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6070        if not splice_config:
 6071            msg_err = "No Splice tool config"
 6072            log.error(msg_err)
 6073            raise ValueError(msg_err)
 6074        log.debug(f"splice_config={splice_config}")
 6075
 6076        # Config - Folders - Databases
 6077        databases_folders = (
 6078            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6079        )
 6080        log.debug("Databases annotations: " + str(databases_folders))
 6081
 6082        # Splice docker image
 6083        splice_docker_image = splice_config.get("docker").get("image")
 6084
 6085        # Pull splice image if it's not already there
 6086        if not check_docker_image_exists(splice_docker_image):
 6087            log.warning(
 6088                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6089            )
 6090            try:
 6091                command(f"docker pull {splice_config.get('docker').get('image')}")
 6092            except subprocess.CalledProcessError:
 6093                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6094                log.error(msg_err)
 6095                raise ValueError(msg_err)
 6096                return None
 6097
 6098        # Config - splice databases
 6099        splice_databases = (
 6100            config.get("folders", {})
 6101            .get("databases", {})
 6102            .get("splice", DEFAULT_SPLICE_FOLDER)
 6103        )
 6104        splice_databases = full_path(splice_databases)
 6105
 6106        # Param
 6107        param = self.get_param()
 6108        log.debug("Param: " + str(param))
 6109
 6110        # Param
 6111        options = param.get("annotation", {}).get("splice", {})
 6112        log.debug("Options: " + str(options))
 6113
 6114        # Data
 6115        table_variants = self.get_table_variants()
 6116
 6117        # Check if not empty
 6118        log.debug("Check if not empty")
 6119        sql_query_chromosomes = (
 6120            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6121        )
 6122        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6123            log.info("VCF empty")
 6124            return None
 6125
 6126        # Export in VCF
 6127        log.debug("Create initial file to annotate")
 6128
 6129        # Create output folder
 6130        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6131        if not os.path.exists(output_folder):
 6132            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6133
 6134        # Create tmp VCF file
 6135        tmp_vcf = NamedTemporaryFile(
 6136            prefix=self.get_prefix(),
 6137            dir=output_folder,
 6138            suffix=".vcf",
 6139            delete=False,
 6140        )
 6141        tmp_vcf_name = tmp_vcf.name
 6142
 6143        # VCF header
 6144        header = self.get_header()
 6145
 6146        # Existing annotations
 6147        for vcf_annotation in self.get_header().infos:
 6148
 6149            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6150            log.debug(
 6151                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6152            )
 6153
 6154        # Memory limit
 6155        if config.get("memory", None):
 6156            memory_limit = config.get("memory", "8G").upper()
 6157            # upper()
 6158        else:
 6159            memory_limit = "8G"
 6160        log.debug(f"memory_limit: {memory_limit}")
 6161
 6162        # Check number of variants to annotate
 6163        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6164        where_clause_regex_spip = r"SPiP_\w+"
 6165        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6166        df_list_of_variants_to_annotate = self.get_query_to_df(
 6167            query=f""" SELECT * FROM variants {where_clause} """
 6168        )
 6169        if len(df_list_of_variants_to_annotate) == 0:
 6170            log.warning(
 6171                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6172            )
 6173            return None
 6174        else:
 6175            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6176
 6177        # Export VCF file
 6178        self.export_variant_vcf(
 6179            vcf_file=tmp_vcf_name,
 6180            remove_info=True,
 6181            add_samples=True,
 6182            index=False,
 6183            where_clause=where_clause,
 6184        )
 6185
 6186        # Create docker container and launch splice analysis
 6187        if splice_config:
 6188
 6189            # Splice mount folders
 6190            mount_folders = splice_config.get("mount", {})
 6191
 6192            # Genome mount
 6193            mount_folders[
 6194                config.get("folders", {})
 6195                .get("databases", {})
 6196                .get("genomes", DEFAULT_GENOME_FOLDER)
 6197            ] = "ro"
 6198
 6199            # SpliceAI mount
 6200            mount_folders[
 6201                config.get("folders", {})
 6202                .get("databases", {})
 6203                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6204            ] = "ro"
 6205
 6206            # Genome mount
 6207            mount_folders[
 6208                config.get("folders", {})
 6209                .get("databases", {})
 6210                .get("spip", DEFAULT_SPIP_FOLDER)
 6211            ] = "ro"
 6212
 6213            # Mount folders
 6214            mount = []
 6215
 6216            # Config mount
 6217            mount = [
 6218                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6219                for path, mode in mount_folders.items()
 6220            ]
 6221
 6222            if any(value for value in splice_config.values() if value is None):
 6223                log.warning("At least one splice config parameter is empty")
 6224                return None
 6225
 6226            # Params in splice nf
 6227            def check_values(dico: dict):
 6228                """
 6229                Ensure parameters for NF splice pipeline
 6230                """
 6231                for key, val in dico.items():
 6232                    if key == "genome":
 6233                        if any(
 6234                            assemb in options.get("genome", {})
 6235                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6236                        ):
 6237                            yield f"--{key} hg19"
 6238                        elif any(
 6239                            assemb in options.get("genome", {})
 6240                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6241                        ):
 6242                            yield f"--{key} hg38"
 6243                    elif (
 6244                        (isinstance(val, str) and val)
 6245                        or isinstance(val, int)
 6246                        or isinstance(val, bool)
 6247                    ):
 6248                        yield f"--{key} {val}"
 6249
 6250            # Genome
 6251            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6252            options["genome"] = genome
 6253
 6254            # NF params
 6255            nf_params = []
 6256
 6257            # Add options
 6258            if options:
 6259                nf_params = list(check_values(options))
 6260                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6261            else:
 6262                log.debug("No NF params provided")
 6263
 6264            # Add threads
 6265            if "threads" not in options.keys():
 6266                nf_params.append(f"--threads {threads}")
 6267
 6268            # Genome path
 6269            genome_path = find_genome(
 6270                config.get("folders", {})
 6271                .get("databases", {})
 6272                .get("genomes", DEFAULT_GENOME_FOLDER),
 6273                file=f"{genome}.fa",
 6274            )
 6275            # Add genome path
 6276            if not genome_path:
 6277                raise ValueError(
 6278                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6279                )
 6280            else:
 6281                log.debug(f"Genome: {genome_path}")
 6282                nf_params.append(f"--genome_path {genome_path}")
 6283
 6284            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6285                """
 6286                Setting up updated databases for SPiP and SpliceAI
 6287                """
 6288
 6289                try:
 6290
 6291                    # SpliceAI assembly transcriptome
 6292                    spliceai_assembly = os.path.join(
 6293                        config.get("folders", {})
 6294                        .get("databases", {})
 6295                        .get("spliceai", {}),
 6296                        options.get("genome"),
 6297                        "transcriptome",
 6298                    )
 6299                    spip_assembly = options.get("genome")
 6300
 6301                    spip = find(
 6302                        f"transcriptome_{spip_assembly}.RData",
 6303                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6304                    )
 6305                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6306                    log.debug(f"SPiP annotations: {spip}")
 6307                    log.debug(f"SpliceAI annotations: {spliceai}")
 6308                    if spip and spliceai:
 6309                        return [
 6310                            f"--spip_transcriptome {spip}",
 6311                            f"--spliceai_annotations {spliceai}",
 6312                        ]
 6313                    else:
 6314                        # TODO crash and go on with basic annotations ?
 6315                        # raise ValueError(
 6316                        #     "Can't find splice databases in configuration EXIT"
 6317                        # )
 6318                        log.warning(
 6319                            "Can't find splice databases in configuration, use annotations file from image"
 6320                        )
 6321                except TypeError:
 6322                    log.warning(
 6323                        "Can't find splice databases in configuration, use annotations file from image"
 6324                    )
 6325                    return []
 6326
            # Add options; check whether transcriptome options have already been provided
 6328            if (
 6329                "spip_transcriptome" not in nf_params
 6330                and "spliceai_transcriptome" not in nf_params
 6331            ):
 6332                splice_reference = splice_annotations(options, config)
 6333                if splice_reference:
 6334                    nf_params.extend(splice_reference)
 6335
 6336            nf_params.append(f"--output_folder {output_folder}")
 6337
 6338            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6339            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6340            log.debug(cmd)
 6341
 6342            splice_config["docker"]["command"] = cmd
 6343
 6344            docker_cmd = get_bin_command(
 6345                tool="splice",
 6346                bin_type="docker",
 6347                config=config,
 6348                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6349                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6350            )
 6351
 6352            # Docker debug
 6353            # if splice_config.get("rm_container"):
 6354            #     rm_container = "--rm"
 6355            # else:
 6356            #     rm_container = ""
 6357            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6358
 6359            log.debug(docker_cmd)
 6360            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6361            log.debug(res.stdout)
 6362            if res.stderr:
 6363                log.error(res.stderr)
 6364            res.check_returncode()
 6365        else:
 6366            log.warning(f"Splice tool configuration not found: {config}")
 6367
 6368        # Update variants
 6369        log.info("Annotation - Updating...")
 6370        # Test find output vcf
 6371        log.debug(
 6372            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6373        )
 6374        output_vcf = []
        # TODO(review): confirm this is the correct folder to search for the splice output
 6376        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6377            if (
 6378                files
 6379                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6380            ):
 6381                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6382        # log.debug(os.listdir(options.get("output_folder")))
 6383        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6384        if not output_vcf:
 6385            log.debug(
 6386                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6387            )
 6388        else:
 6389            # Get new header from annotated vcf
 6390            log.debug(f"Initial header: {len(header.infos)} fields")
 6391            # Create new header with splice infos
 6392            new_vcf = Variants(input=output_vcf[0])
 6393            new_vcf_header = new_vcf.get_header().infos
 6394            for keys, infos in new_vcf_header.items():
 6395                if keys not in header.infos.keys():
 6396                    header.infos[keys] = infos
 6397            log.debug(f"New header: {len(header.infos)} fields")
 6398            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6399            self.update_from_vcf(output_vcf[0])
 6400
 6401        # Remove folder
 6402        remove_if_exists(output_folder)
 6403
 6404    ###
 6405    # Prioritization
 6406    ###
 6407
 6408    def get_config_default(self, name: str) -> dict:
 6409        """
 6410        The function `get_config_default` returns a dictionary containing default configurations for
 6411        various calculations and prioritizations.
 6412
 6413        :param name: The `get_config_default` function returns a dictionary containing default
 6414        configurations for different calculations and prioritizations. The `name` parameter is used to
 6415        specify which specific configuration to retrieve from the dictionary
 6416        :type name: str
 6417        :return: The function `get_config_default` returns a dictionary containing default configuration
 6418        settings for different calculations and prioritizations. The specific configuration settings are
 6419        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6420        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6421        returned. If there is no match, an empty dictionary is returned.
 6422        """
 6423
 6424        config_default = {
 6425            "calculations": {
 6426                "variant_chr_pos_alt_ref": {
 6427                    "type": "sql",
 6428                    "name": "variant_chr_pos_alt_ref",
 6429                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6430                    "available": False,
 6431                    "output_column_name": "variant_chr_pos_alt_ref",
 6432                    "output_column_type": "String",
 6433                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6434                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6435                    "operation_info": True,
 6436                },
 6437                "VARTYPE": {
 6438                    "type": "sql",
 6439                    "name": "VARTYPE",
 6440                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6441                    "available": True,
 6442                    "output_column_name": "VARTYPE",
 6443                    "output_column_type": "String",
 6444                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6445                    "operation_query": """
 6446                            CASE
 6447                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6448                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6449                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6450                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6451                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6452                                ELSE 'UNDEFINED'
 6453                            END
 6454                            """,
 6455                    "info_fields": ["SVTYPE"],
 6456                    "operation_info": True,
 6457                },
 6458                "snpeff_hgvs": {
 6459                    "type": "python",
 6460                    "name": "snpeff_hgvs",
 6461                    "description": "HGVS nomenclatures from snpEff annotation",
 6462                    "available": True,
 6463                    "function_name": "calculation_extract_snpeff_hgvs",
 6464                    "function_params": ["snpeff_hgvs", "ANN"],
 6465                },
 6466                "snpeff_ann_explode": {
 6467                    "type": "python",
 6468                    "name": "snpeff_ann_explode",
 6469                    "description": "Explode snpEff annotations with uniquify values",
 6470                    "available": True,
 6471                    "function_name": "calculation_snpeff_ann_explode",
 6472                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6473                },
 6474                "snpeff_ann_explode_uniquify": {
 6475                    "type": "python",
 6476                    "name": "snpeff_ann_explode_uniquify",
 6477                    "description": "Explode snpEff annotations",
 6478                    "available": True,
 6479                    "function_name": "calculation_snpeff_ann_explode",
 6480                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6481                },
 6482                "snpeff_ann_explode_json": {
 6483                    "type": "python",
 6484                    "name": "snpeff_ann_explode_json",
 6485                    "description": "Explode snpEff annotations in JSON format",
 6486                    "available": True,
 6487                    "function_name": "calculation_snpeff_ann_explode",
 6488                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6489                },
 6490                "NOMEN": {
 6491                    "type": "python",
 6492                    "name": "NOMEN",
 6493                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6494                    "available": True,
 6495                    "function_name": "calculation_extract_nomen",
 6496                    "function_params": [],
 6497                },
 6498                "FINDBYPIPELINE": {
 6499                    "type": "python",
 6500                    "name": "FINDBYPIPELINE",
 6501                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6502                    "available": True,
 6503                    "function_name": "calculation_find_by_pipeline",
 6504                    "function_params": ["findbypipeline"],
 6505                },
 6506                "FINDBYSAMPLE": {
 6507                    "type": "python",
 6508                    "name": "FINDBYSAMPLE",
 6509                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6510                    "available": True,
 6511                    "function_name": "calculation_find_by_pipeline",
 6512                    "function_params": ["findbysample"],
 6513                },
 6514                "GENOTYPECONCORDANCE": {
 6515                    "type": "python",
 6516                    "name": "GENOTYPECONCORDANCE",
 6517                    "description": "Concordance of genotype for multi caller VCF",
 6518                    "available": True,
 6519                    "function_name": "calculation_genotype_concordance",
 6520                    "function_params": [],
 6521                },
 6522                "BARCODE": {
 6523                    "type": "python",
 6524                    "name": "BARCODE",
 6525                    "description": "BARCODE as VaRank tool",
 6526                    "available": True,
 6527                    "function_name": "calculation_barcode",
 6528                    "function_params": [],
 6529                },
 6530                "BARCODEFAMILY": {
 6531                    "type": "python",
 6532                    "name": "BARCODEFAMILY",
 6533                    "description": "BARCODEFAMILY as VaRank tool",
 6534                    "available": True,
 6535                    "function_name": "calculation_barcode_family",
 6536                    "function_params": ["BCF"],
 6537                },
 6538                "TRIO": {
 6539                    "type": "python",
 6540                    "name": "TRIO",
 6541                    "description": "Inheritance for a trio family",
 6542                    "available": True,
 6543                    "function_name": "calculation_trio",
 6544                    "function_params": [],
 6545                },
 6546                "VAF": {
 6547                    "type": "python",
 6548                    "name": "VAF",
 6549                    "description": "Variant Allele Frequency (VAF) harmonization",
 6550                    "available": True,
 6551                    "function_name": "calculation_vaf_normalization",
 6552                    "function_params": [],
 6553                },
 6554                "VAF_stats": {
 6555                    "type": "python",
 6556                    "name": "VAF_stats",
 6557                    "description": "Variant Allele Frequency (VAF) statistics",
 6558                    "available": True,
 6559                    "function_name": "calculation_genotype_stats",
 6560                    "function_params": ["VAF"],
 6561                },
 6562                "DP_stats": {
 6563                    "type": "python",
 6564                    "name": "DP_stats",
 6565                    "description": "Depth (DP) statistics",
 6566                    "available": True,
 6567                    "function_name": "calculation_genotype_stats",
 6568                    "function_params": ["DP"],
 6569                },
 6570                "variant_id": {
 6571                    "type": "python",
 6572                    "name": "variant_id",
 6573                    "description": "Variant ID generated from variant position and type",
 6574                    "available": True,
 6575                    "function_name": "calculation_variant_id",
 6576                    "function_params": [],
 6577                },
 6578                "transcripts_json": {
 6579                    "type": "python",
 6580                    "name": "transcripts_json",
 6581                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6582                    "available": True,
 6583                    "function_name": "calculation_transcripts_annotation",
 6584                    "function_params": ["transcripts_json", None],
 6585                },
 6586                "transcripts_ann": {
 6587                    "type": "python",
 6588                    "name": "transcripts_ann",
 6589                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6590                    "available": True,
 6591                    "function_name": "calculation_transcripts_annotation",
 6592                    "function_params": [None, "transcripts_ann"],
 6593                },
 6594                "transcripts_annotations": {
 6595                    "type": "python",
 6596                    "name": "transcripts_annotations",
 6597                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6598                    "available": True,
 6599                    "function_name": "calculation_transcripts_annotation",
 6600                    "function_params": [None, None],
 6601                },
 6602                "transcripts_prioritization": {
 6603                    "type": "python",
 6604                    "name": "transcripts_prioritization",
 6605                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6606                    "available": True,
 6607                    "function_name": "calculation_transcripts_prioritization",
 6608                    "function_params": [],
 6609                },
 6610            },
 6611            "prioritizations": {
 6612                "default": {
 6613                    "ANN2": [
 6614                        {
 6615                            "type": "contains",
 6616                            "value": "HIGH",
 6617                            "score": 5,
 6618                            "flag": "PASS",
 6619                            "comment": [
 6620                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6621                            ],
 6622                        },
 6623                        {
 6624                            "type": "contains",
 6625                            "value": "MODERATE",
 6626                            "score": 3,
 6627                            "flag": "PASS",
 6628                            "comment": [
 6629                                "A non-disruptive variant that might change protein effectiveness"
 6630                            ],
 6631                        },
 6632                        {
 6633                            "type": "contains",
 6634                            "value": "LOW",
 6635                            "score": 0,
 6636                            "flag": "FILTERED",
 6637                            "comment": [
 6638                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6639                            ],
 6640                        },
 6641                        {
 6642                            "type": "contains",
 6643                            "value": "MODIFIER",
 6644                            "score": 0,
 6645                            "flag": "FILTERED",
 6646                            "comment": [
 6647                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6648                            ],
 6649                        },
 6650                    ],
 6651                }
 6652            },
 6653        }
 6654
 6655        return config_default.get(name, None)
 6656
 6657    def get_config_json(
 6658        self, name: str, config_dict: dict = {}, config_file: str = None
 6659    ) -> dict:
 6660        """
 6661        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6662        default values, a dictionary, and a file.
 6663
 6664        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6665        the name of the configuration. It is used to identify and retrieve the configuration settings
 6666        for a specific component or module
 6667        :type name: str
 6668        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6669        dictionary that allows you to provide additional configuration settings or overrides. When you
 6670        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6671        the key is the configuration setting you want to override or
 6672        :type config_dict: dict
 6673        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6674        specify the path to a configuration file that contains additional settings. If provided, the
 6675        function will read the contents of this file and update the configuration dictionary with the
 6676        values found in the file, overriding any existing values with the
 6677        :type config_file: str
 6678        :return: The function `get_config_json` returns a dictionary containing the configuration
 6679        settings.
 6680        """
 6681
 6682        # Create with default prioritizations
 6683        config_default = self.get_config_default(name=name)
 6684        configuration = config_default
 6685        # log.debug(f"configuration={configuration}")
 6686
 6687        # Replace prioritizations from dict
 6688        for config in config_dict:
 6689            configuration[config] = config_dict[config]
 6690
 6691        # Replace prioritizations from file
 6692        config_file = full_path(config_file)
 6693        if config_file:
 6694            if os.path.exists(config_file):
 6695                with open(config_file) as config_file_content:
 6696                    config_file_dict = json.load(config_file_content)
 6697                for config in config_file_dict:
 6698                    configuration[config] = config_file_dict[config]
 6699            else:
 6700                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6701                log.error(msg_error)
 6702                raise ValueError(msg_error)
 6703
 6704        return configuration
 6705
 6706    def prioritization(
 6707        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6708    ) -> bool:
 6709        """
 6710        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6711        prioritizes variants based on configured profiles and criteria.
 6712
 6713        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6714        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6715        a table name is provided, the method will prioritize the variants in that specific table
 6716        :type table: str
 6717        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6718        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6719        provided, the code will use a default prefix value of "PZ"
 6720        :type pz_prefix: str
 6721        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6722        additional parameters specific to the prioritization process. These parameters can include
 6723        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6724        configurations needed for the prioritization of variants in a V
 6725        :type pz_param: dict
 6726        :return: A boolean value (True) is being returned from the `prioritization` function.
 6727        """
 6728
 6729        # Config
 6730        config = self.get_config()
 6731
 6732        # Param
 6733        param = self.get_param()
 6734
 6735        # Prioritization param
 6736        if pz_param is not None:
 6737            prioritization_param = pz_param
 6738        else:
 6739            prioritization_param = param.get("prioritization", {})
 6740
 6741        # Configuration profiles
 6742        prioritization_config_file = prioritization_param.get(
 6743            "prioritization_config", None
 6744        )
 6745        prioritization_config_file = full_path(prioritization_config_file)
 6746        prioritizations_config = self.get_config_json(
 6747            name="prioritizations", config_file=prioritization_config_file
 6748        )
 6749
 6750        # Prioritization prefix
 6751        pz_prefix_default = "PZ"
 6752        if pz_prefix is None:
 6753            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6754
 6755        # Prioritization options
 6756        profiles = prioritization_param.get("profiles", [])
 6757        if isinstance(profiles, str):
 6758            profiles = profiles.split(",")
 6759        pzfields = prioritization_param.get(
 6760            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6761        )
 6762        if isinstance(pzfields, str):
 6763            pzfields = pzfields.split(",")
 6764        default_profile = prioritization_param.get("default_profile", None)
 6765        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6766        prioritization_score_mode = prioritization_param.get(
 6767            "prioritization_score_mode", "HOWARD"
 6768        )
 6769
 6770        # Quick Prioritizations
 6771        prioritizations = param.get("prioritizations", None)
 6772        if prioritizations:
 6773            log.info("Quick Prioritization:")
 6774            for profile in prioritizations.split(","):
 6775                if profile not in profiles:
 6776                    profiles.append(profile)
 6777                    log.info(f"   {profile}")
 6778
 6779        # If profile "ALL" provided, all profiles in the config profiles
 6780        if "ALL" in profiles:
 6781            profiles = list(prioritizations_config.keys())
 6782
 6783        for profile in profiles:
 6784            if prioritizations_config.get(profile, None):
 6785                log.debug(f"Profile '{profile}' configured")
 6786            else:
 6787                msg_error = f"Profile '{profile}' NOT configured"
 6788                log.error(msg_error)
 6789                raise ValueError(msg_error)
 6790
 6791        if profiles:
 6792            log.info(f"Prioritization... ")
 6793        else:
 6794            log.debug(f"No profile defined")
 6795            return False
 6796
 6797        if not default_profile and len(profiles):
 6798            default_profile = profiles[0]
 6799
 6800        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6801        log.debug("Profiles to check: " + str(list(profiles)))
 6802
 6803        # Variables
 6804        if table is not None:
 6805            table_variants = table
 6806        else:
 6807            table_variants = self.get_table_variants(clause="update")
 6808        log.debug(f"Table to prioritize: {table_variants}")
 6809
 6810        # Added columns
 6811        added_columns = []
 6812
 6813        # Create list of PZfields
 6814        # List of PZFields
 6815        list_of_pzfields_original = pzfields + [
 6816            pzfield + pzfields_sep + profile
 6817            for pzfield in pzfields
 6818            for profile in profiles
 6819        ]
 6820        list_of_pzfields = []
 6821        log.debug(f"{list_of_pzfields_original}")
 6822
 6823        # Remove existing PZfields to use if exists
 6824        for pzfield in list_of_pzfields_original:
 6825            if self.get_header().infos.get(pzfield, None) is None:
 6826                list_of_pzfields.append(pzfield)
 6827                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6828            else:
 6829                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6830
 6831        if list_of_pzfields:
 6832
 6833            # Explode Infos prefix
 6834            explode_infos_prefix = self.get_explode_infos_prefix()
 6835
 6836            # PZfields tags description
 6837            PZfields_INFOS = {
 6838                f"{pz_prefix}Tags": {
 6839                    "ID": f"{pz_prefix}Tags",
 6840                    "Number": ".",
 6841                    "Type": "String",
 6842                    "Description": "Variant tags based on annotation criteria",
 6843                },
 6844                f"{pz_prefix}Score": {
 6845                    "ID": f"{pz_prefix}Score",
 6846                    "Number": 1,
 6847                    "Type": "Integer",
 6848                    "Description": "Variant score based on annotation criteria",
 6849                },
 6850                f"{pz_prefix}Flag": {
 6851                    "ID": f"{pz_prefix}Flag",
 6852                    "Number": 1,
 6853                    "Type": "String",
 6854                    "Description": "Variant flag based on annotation criteria",
 6855                },
 6856                f"{pz_prefix}Comment": {
 6857                    "ID": f"{pz_prefix}Comment",
 6858                    "Number": ".",
 6859                    "Type": "String",
 6860                    "Description": "Variant comment based on annotation criteria",
 6861                },
 6862                f"{pz_prefix}Infos": {
 6863                    "ID": f"{pz_prefix}Infos",
 6864                    "Number": ".",
 6865                    "Type": "String",
 6866                    "Description": "Variant infos based on annotation criteria",
 6867                },
 6868                f"{pz_prefix}Class": {
 6869                    "ID": f"{pz_prefix}Class",
 6870                    "Number": ".",
 6871                    "Type": "String",
 6872                    "Description": "Variant class based on annotation criteria",
 6873                },
 6874            }
 6875
 6876            # Create INFO fields if not exist
 6877            for field in PZfields_INFOS:
 6878                field_ID = PZfields_INFOS[field]["ID"]
 6879                field_description = PZfields_INFOS[field]["Description"]
 6880                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6881                    field_description = (
 6882                        PZfields_INFOS[field]["Description"]
 6883                        + f", profile {default_profile}"
 6884                    )
 6885                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6886                        field_ID,
 6887                        PZfields_INFOS[field]["Number"],
 6888                        PZfields_INFOS[field]["Type"],
 6889                        field_description,
 6890                        "unknown",
 6891                        "unknown",
 6892                        code_type_map[PZfields_INFOS[field]["Type"]],
 6893                    )
 6894
 6895            # Create INFO fields if not exist for each profile
 6896            for profile in prioritizations_config:
 6897                if profile in profiles or profiles == []:
 6898                    for field in PZfields_INFOS:
 6899                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6900                        field_description = (
 6901                            PZfields_INFOS[field]["Description"]
 6902                            + f", profile {profile}"
 6903                        )
 6904                        if (
 6905                            field_ID not in self.get_header().infos
 6906                            and field in pzfields
 6907                        ):
 6908                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6909                                field_ID,
 6910                                PZfields_INFOS[field]["Number"],
 6911                                PZfields_INFOS[field]["Type"],
 6912                                field_description,
 6913                                "unknown",
 6914                                "unknown",
 6915                                code_type_map[PZfields_INFOS[field]["Type"]],
 6916                            )
 6917
 6918            # Header
 6919            for pzfield in list_of_pzfields:
 6920                if re.match(f"{pz_prefix}Score.*", pzfield):
 6921                    added_column = self.add_column(
 6922                        table_name=table_variants,
 6923                        column_name=pzfield,
 6924                        column_type="INTEGER",
 6925                        default_value="0",
 6926                    )
 6927                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6928                    added_column = self.add_column(
 6929                        table_name=table_variants,
 6930                        column_name=pzfield,
 6931                        column_type="BOOLEAN",
 6932                        default_value="1",
 6933                    )
 6934                elif re.match(f"{pz_prefix}Class.*", pzfield):
 6935                    added_column = self.add_column(
 6936                        table_name=table_variants,
 6937                        column_name=pzfield,
 6938                        column_type="VARCHAR[]",
 6939                        default_value="null",
 6940                    )
 6941                else:
 6942                    added_column = self.add_column(
 6943                        table_name=table_variants,
 6944                        column_name=pzfield,
 6945                        column_type="STRING",
 6946                        default_value="''",
 6947                    )
 6948                added_columns.append(added_column)
 6949
 6950            # Profiles
 6951            if profiles:
 6952
 6953                # foreach profile in configuration file
 6954                for profile in prioritizations_config:
 6955
 6956                    # If profile is asked in param, or ALL are asked (empty profile [])
 6957                    if profile in profiles or profiles == []:
 6958                        log.info(f"Profile '{profile}'")
 6959
 6960                        sql_set_info_option = ""
 6961
 6962                        sql_set_info = []
 6963
 6964                        # PZ fields set
 6965
 6966                        # PZScore
 6967                        if (
 6968                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6969                            in list_of_pzfields
 6970                        ):
 6971                            sql_set_info.append(
 6972                                f"""
 6973                                    concat(
 6974                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6975                                        {pz_prefix}Score{pzfields_sep}{profile}
 6976                                    ) 
 6977                                """
 6978                            )
 6979                            if (
 6980                                profile == default_profile
 6981                                and f"{pz_prefix}Score" in list_of_pzfields
 6982                            ):
 6983                                sql_set_info.append(
 6984                                    f"""
 6985                                        concat(
 6986                                            '{pz_prefix}Score=',
 6987                                            {pz_prefix}Score{pzfields_sep}{profile}
 6988                                        )
 6989                                    """
 6990                                )
 6991
 6992                        # PZFlag
 6993                        if (
 6994                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 6995                            in list_of_pzfields
 6996                        ):
 6997                            sql_set_info.append(
 6998                                f"""
 6999                                    concat(
 7000                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7001                                        CASE 
 7002                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7003                                            THEN 'PASS'
 7004                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7005                                            THEN 'FILTERED'
 7006                                        END
 7007                                    ) 
 7008                                """
 7009                            )
 7010                            if (
 7011                                profile == default_profile
 7012                                and f"{pz_prefix}Flag" in list_of_pzfields
 7013                            ):
 7014                                sql_set_info.append(
 7015                                    f"""
 7016                                        concat(
 7017                                            '{pz_prefix}Flag=',
 7018                                            CASE 
 7019                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7020                                                THEN 'PASS'
 7021                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7022                                                THEN 'FILTERED'
 7023                                            END
 7024                                        )
 7025                                    """
 7026                                )
 7027
 7028                        # PZClass
 7029                        if (
 7030                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7031                            in list_of_pzfields
 7032                        ):
 7033                            sql_set_info.append(
 7034                                f"""
 7035                                    concat(
 7036                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7037                                        CASE
 7038                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7039                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7040                                            ELSE '.'
 7041                                        END 
 7042                                    )
 7043                                    
 7044                                """
 7045                            )
 7046                            if (
 7047                                profile == default_profile
 7048                                and f"{pz_prefix}Class" in list_of_pzfields
 7049                            ):
 7050                                sql_set_info.append(
 7051                                    f"""
 7052                                        concat(
 7053                                            '{pz_prefix}Class=',
 7054                                            CASE
 7055                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7056                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7057                                                ELSE '.'
 7058                                            END 
 7059                                        )
 7060                                    """
 7061                                )
 7062
 7063                        # PZComment
 7064                        if (
 7065                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7066                            in list_of_pzfields
 7067                        ):
 7068                            sql_set_info.append(
 7069                                f"""
 7070                                    CASE
 7071                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7072                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7073                                        ELSE ''
 7074                                    END
 7075                                """
 7076                            )
 7077                            if (
 7078                                profile == default_profile
 7079                                and f"{pz_prefix}Comment" in list_of_pzfields
 7080                            ):
 7081                                sql_set_info.append(
 7082                                    f"""
 7083                                        CASE
 7084                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7085                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7086                                            ELSE ''
 7087                                        END
 7088                                    """
 7089                                )
 7090
 7091                        # PZInfos
 7092                        if (
 7093                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7094                            in list_of_pzfields
 7095                        ):
 7096                            sql_set_info.append(
 7097                                f"""
 7098                                    CASE
 7099                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7100                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7101                                        ELSE ''
 7102                                    END
 7103                                """
 7104                            )
 7105                            if (
 7106                                profile == default_profile
 7107                                and f"{pz_prefix}Infos" in list_of_pzfields
 7108                            ):
 7109                                sql_set_info.append(
 7110                                    f"""
 7111                                        CASE
 7112                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7113                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7114                                            ELSE ''
 7115                                        END
 7116                                    """
 7117                                )
 7118
 7119                        # Merge PZfields
 7120                        sql_set_info_option = ""
 7121                        sql_set_sep = ""
 7122                        for sql_set in sql_set_info:
 7123                            if sql_set_sep:
 7124                                sql_set_info_option += f"""
 7125                                    , concat('{sql_set_sep}', {sql_set})
 7126                                """
 7127                            else:
 7128                                sql_set_info_option += f"""
 7129                                    , {sql_set}
 7130                                """
 7131                            sql_set_sep = ";"
 7132
 7133                        sql_queries = []
 7134                        for annotation in prioritizations_config[profile]:
 7135
 7136                            # skip special sections
 7137                            if annotation.startswith("_"):
 7138                                continue
 7139
 7140                            # For each criterions
 7141                            for criterion in prioritizations_config[profile][
 7142                                annotation
 7143                            ]:
 7144
 7145                                # Criterion mode
 7146                                criterion_mode = None
 7147                                if np.any(
 7148                                    np.isin(list(criterion.keys()), ["type", "value"])
 7149                                ):
 7150                                    criterion_mode = "operation"
 7151                                elif np.any(
 7152                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7153                                ):
 7154                                    criterion_mode = "sql"
 7155                                log.debug(f"Criterion Mode: {criterion_mode}")
 7156
 7157                                # Criterion parameters
 7158                                criterion_type = criterion.get("type", None)
 7159                                criterion_value = criterion.get("value", None)
 7160                                criterion_sql = criterion.get("sql", None)
 7161                                criterion_fields = criterion.get("fields", None)
 7162                                criterion_score = criterion.get("score", 0)
 7163                                criterion_flag = criterion.get("flag", "PASS")
 7164                                criterion_class = criterion.get("class", None)
 7165                                criterion_flag_bool = criterion_flag == "PASS"
 7166                                criterion_comment = (
 7167                                    ", ".join(criterion.get("comment", []))
 7168                                    .replace("'", "''")
 7169                                    .replace(";", ",")
 7170                                    .replace("\t", " ")
 7171                                )
 7172                                criterion_infos = (
 7173                                    str(criterion)
 7174                                    .replace("'", "''")
 7175                                    .replace(";", ",")
 7176                                    .replace("\t", " ")
 7177                                )
 7178
 7179                                # SQL
 7180                                if criterion_sql is not None and isinstance(
 7181                                    criterion_sql, list
 7182                                ):
 7183                                    criterion_sql = " ".join(criterion_sql)
 7184
 7185                                # Fields and explode
 7186                                if criterion_fields is None:
 7187                                    criterion_fields = [annotation]
 7188                                if not isinstance(criterion_fields, list):
 7189                                    criterion_fields = str(criterion_fields).split(",")
 7190
 7191                                # Class
 7192                                if criterion_class is not None and not isinstance(
 7193                                    criterion_class, list
 7194                                ):
 7195                                    criterion_class = str(criterion_class).split(",")
 7196
 7197                                for annotation_field in criterion_fields:
 7198
 7199                                    # Explode specific annotation
 7200                                    log.debug(
 7201                                        f"Explode annotation '{annotation_field}'"
 7202                                    )
 7203                                    added_columns += self.explode_infos(
 7204                                        prefix=explode_infos_prefix,
 7205                                        fields=[annotation_field],
 7206                                        table=table_variants,
 7207                                    )
 7208                                    extra_infos = self.get_extra_infos(
 7209                                        table=table_variants
 7210                                    )
 7211
 7212                                    # Check if annotation field is present
 7213                                    if (
 7214                                        f"{explode_infos_prefix}{annotation_field}"
 7215                                        not in extra_infos
 7216                                    ):
 7217                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7218                                        log.error(msq_err)
 7219                                        raise ValueError(msq_err)
 7220                                    else:
 7221                                        log.debug(
 7222                                            f"Annotation '{annotation_field}' in data"
 7223                                        )
 7224
 7225                                sql_set = []
 7226                                sql_set_info = []
 7227
 7228                                # PZ fields set
 7229
 7230                                # PZScore
 7231                                if (
 7232                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7233                                    in list_of_pzfields
 7234                                ):
 7235                                    # if prioritization_score_mode == "HOWARD":
 7236                                    #     sql_set.append(
 7237                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7238                                    #     )
 7239                                    # VaRank prioritization score mode
 7240                                    if prioritization_score_mode == "VaRank":
 7241                                        sql_set.append(
 7242                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7243                                        )
 7244                                    # default HOWARD prioritization score mode
 7245                                    else:
 7246                                        sql_set.append(
 7247                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7248                                        )
 7249
 7250                                # PZFlag
 7251                                if (
 7252                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7253                                    in list_of_pzfields
 7254                                ):
 7255                                    sql_set.append(
 7256                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7257                                    )
 7258
 7259                                # PZClass
 7260                                if (
 7261                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7262                                    in list_of_pzfields
 7263                                    and criterion_class is not None
 7264                                ):
 7265                                    sql_set.append(
 7266                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7267                                    )
 7268
 7269                                # PZComment
 7270                                if (
 7271                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7272                                    in list_of_pzfields
 7273                                ):
 7274                                    sql_set.append(
 7275                                        f"""
 7276                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7277                                                concat(
 7278                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7279                                                    CASE 
 7280                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7281                                                        THEN ', '
 7282                                                        ELSE ''
 7283                                                    END,
 7284                                                    '{criterion_comment}'
 7285                                                )
 7286                                        """
 7287                                    )
 7288
 7289                                # PZInfos
 7290                                if (
 7291                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7292                                    in list_of_pzfields
 7293                                ):
 7294                                    sql_set.append(
 7295                                        f"""
 7296                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7297                                                concat(
 7298                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7299                                                    '{criterion_infos}'
 7300                                                )
 7301                                        """
 7302                                    )
 7303                                sql_set_option = ",".join(sql_set)
 7304
 7305                                # Criterion and comparison
 7306                                if sql_set_option:
 7307
 7308                                    if criterion_mode in ["operation"]:
 7309
 7310                                        try:
 7311                                            float(criterion_value)
 7312                                            sql_update = f"""
 7313                                                UPDATE {table_variants}
 7314                                                SET {sql_set_option}
 7315                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7316                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7317                                            """
 7318                                        except:
 7319                                            contains_option = ""
 7320                                            if criterion_type == "contains":
 7321                                                contains_option = ".*"
 7322                                            sql_update = f"""
 7323                                                UPDATE {table_variants}
 7324                                                SET {sql_set_option}
 7325                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7326                                            """
 7327                                        sql_queries.append(sql_update)
 7328
 7329                                    elif criterion_mode in ["sql"]:
 7330
 7331                                        sql_update = f"""
 7332                                            UPDATE {table_variants}
 7333                                            SET {sql_set_option}
 7334                                            WHERE {criterion_sql}
 7335                                        """
 7336                                        sql_queries.append(sql_update)
 7337
 7338                                    else:
 7339                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7340                                        log.error(msg_err)
 7341                                        raise ValueError(msg_err)
 7342
 7343                                else:
 7344                                    log.warning(
 7345                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7346                                    )
 7347
 7348                        # PZTags
 7349                        if (
 7350                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7351                            in list_of_pzfields
 7352                        ):
 7353
 7354                            # Create PZFalgs value
 7355                            pztags_value = ""
 7356                            pztags_sep_default = ","
 7357                            pztags_sep = ""
 7358                            for pzfield in pzfields:
 7359                                if pzfield not in [f"{pz_prefix}Tags"]:
 7360                                    if (
 7361                                        f"{pzfield}{pzfields_sep}{profile}"
 7362                                        in list_of_pzfields
 7363                                    ):
 7364                                        if pzfield in [f"{pz_prefix}Flag"]:
 7365                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7366                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7367                                                    THEN 'PASS'
 7368                                                    ELSE 'FILTERED'
 7369                                                END, '"""
 7370                                        elif pzfield in [f"{pz_prefix}Class"]:
 7371                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7372                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7373                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7374                                                    ELSE '.'
 7375                                                END, '"""
 7376                                        else:
 7377                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7378                                        pztags_sep = pztags_sep_default
 7379
 7380                            # Add Query update for PZFlags
 7381                            sql_update_pztags = f"""
 7382                                UPDATE {table_variants}
 7383                                SET INFO = concat(
 7384                                        INFO,
 7385                                        CASE WHEN INFO NOT in ('','.')
 7386                                                THEN ';'
 7387                                                ELSE ''
 7388                                        END,
 7389                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7390                                    )
 7391                                """
 7392                            sql_queries.append(sql_update_pztags)
 7393
 7394                            # Add Query update for PZFlags for default
 7395                            if profile == default_profile:
 7396                                sql_update_pztags_default = f"""
 7397                                UPDATE {table_variants}
 7398                                SET INFO = concat(
 7399                                        INFO,
 7400                                        ';',
 7401                                        '{pz_prefix}Tags={pztags_value}'
 7402                                    )
 7403                                """
 7404                                sql_queries.append(sql_update_pztags_default)
 7405
 7406                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7407
 7408                        if sql_queries:
 7409
 7410                            for sql_query in sql_queries:
 7411                                log.debug(
 7412                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7413                                )
 7414                                self.conn.execute(sql_query)
 7415
 7416                        log.info(f"""Profile '{profile}' - Update... """)
 7417                        sql_query_update = f"""
 7418                            UPDATE {table_variants}
 7419                            SET INFO =  
 7420                                concat(
 7421                                    CASE
 7422                                        WHEN INFO NOT IN ('','.')
 7423                                        THEN concat(INFO, ';')
 7424                                        ELSE ''
 7425                                    END
 7426                                    {sql_set_info_option}
 7427                                )
 7428                        """
 7429                        self.conn.execute(sql_query_update)
 7430
 7431        else:
 7432
 7433            log.warning(f"No profiles in parameters")
 7434
 7435        # Remove added columns
 7436        for added_column in added_columns:
 7437            self.drop_column(column=added_column)
 7438
 7439        # Explode INFOS fields into table fields
 7440        if self.get_explode_infos():
 7441            self.explode_infos(
 7442                prefix=self.get_explode_infos_prefix(),
 7443                fields=self.get_explode_infos_fields(),
 7444                force=True,
 7445            )
 7446
 7447        return True
 7448
 7449    ###
 7450    # HGVS
 7451    ###
 7452
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Side effects: adds an 'hgvs=<names>' entry to the INFO column for each
        annotated variant, registers an 'hgvs' INFO field in the VCF header,
        and drops the temporary working column before returning. Returns early
        (no annotation) when no 'hgvs' section is present in the parameters.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of one partition.

            :param partition: pandas DataFrame holding the rows (variants) of
            this partition
            :return: Series of HGVS annotation strings, one per row
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated list of HGVS names for one variant row.

            Queries the polars SQL context (which resolves `refseq_df` /
            `refseqlink_df` from enclosing-scope globals at call time) for the
            transcripts overlapping the variant position, then formats one
            HGVS name per transcript (plus an extra protein-form name when
            `add_protein` is set without `use_protein`/`full_format`).

            :param row: dictionary-like object with "CHROM", "POS", "REF" and
            "ALT" keys
            :return: comma-joined HGVS names (empty string when no transcript
            overlaps the position)
            """

            chr = row["CHROM"]  # NOTE: shadows the builtin `chr` inside this closure
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of transcripts overlapping this position (from refseq_df)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model loaded earlier by read_transcripts()
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number at this position (only when requested)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession for this transcript
                # NOTE(review): this query assumes refseqlink_df was registered
                # (i.e. a refSeqLink file was found); otherwise it would fail
                # at query time — TODO confirm upstream guarantees
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name in the configured format
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Extra protein-form name when add_protein is requested on top
                # of the default (non-protein, non-full) format
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create comma-separated list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): created here before refseq_df/refseqlink_df exist and
        # re-created below once they do; the nested partition functions pick up
        # whichever binding of `polars_conn` is current at call time
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (default) and explicit genome path
        # NOTE(review): both read the same config key "genomes" — one as a
        # folder with a default, one as a direct path — confirm intended
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "key=value,flag,..." options into param["hgvs"],
        # coercing TRUE/FALSE tokens to booleans (bare flags default to True)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param (formatting switches used by the partition closures)
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq/refSeqLink: param-level values override the config-level ones
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then lookup by folder + assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq file
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink file (optional — protein accessions)
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only: REF and ALT strictly alphabetic
        # (excludes symbolic alleles / breakends)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (dropped at the end of the method)
        added_columns = []

        # Add hgvs working column in variants table (randomized name to avoid
        # collisions with existing columns)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Load only the refSeq rows overlapping our variants into a polars
        # DataFrame (queried later by the partition closures)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            # NOTE(review): refseqlink_df is only bound on this branch; the
            # nested protein query assumes it exists — verify callers always
            # provide a refSeqLink file when protein options are enabled
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model, via a temporary
        # TSV export of the variant-overlapping refSeq rows
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion — re-created now that refseq_df/refseqlink_df exist,
        # so register_globals can resolve them for the partition closures
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs working column by joining on the full variant key
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<names>' with a ';' separator when
        # INFO already holds content
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add 'hgvs' INFO field to the VCF header
        # NOTE(review): 'annotatation' typo below is emitted into the VCF
        # header description — left unchanged here
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added working columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 7835
 7836    ###
 7837    # Calculation
 7838    ###
 7839
 7840    def get_operations_help(
 7841        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7842    ) -> list:
 7843
 7844        # Init
 7845        operations_help = []
 7846
 7847        # operations
 7848        operations = self.get_config_json(
 7849            name="calculations",
 7850            config_dict=operations_config_dict,
 7851            config_file=operations_config_file,
 7852        )
 7853        for op in operations:
 7854            op_name = operations[op].get("name", op).upper()
 7855            op_description = operations[op].get("description", op_name)
 7856            op_available = operations[op].get("available", False)
 7857            if op_available:
 7858                operations_help.append(f"   {op_name}: {op_description}")
 7859
 7860        # Sort operations
 7861        operations_help.sort()
 7862
 7863        # insert header
 7864        operations_help.insert(0, "Available calculation operations:")
 7865
 7866        # Return
 7867        return operations_help
 7868
 7869    def calculation(
 7870        self,
 7871        operations: dict = {},
 7872        operations_config_dict: dict = {},
 7873        operations_config_file: str = None,
 7874    ) -> None:
 7875        """
 7876        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7877        operation, and then calls the appropriate function
 7878
 7879        param json example:
 7880            "calculation": {
 7881                "NOMEN": {
 7882                    "options": {
 7883                        "hgvs_field": "hgvs"
 7884                    },
 7885                "middle" : null
 7886            }
 7887        """
 7888
 7889        # Param
 7890        param = self.get_param()
 7891
 7892        # operations config
 7893        operations_config = self.get_config_json(
 7894            name="calculations",
 7895            config_dict=operations_config_dict,
 7896            config_file=operations_config_file,
 7897        )
 7898
 7899        # Upper keys
 7900        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7901
 7902        # Calculations
 7903
 7904        # Operations from param
 7905        operations = param.get("calculation", {}).get("calculations", operations)
 7906
 7907        # Quick calculation - add
 7908        if param.get("calculations", None):
 7909            calculations_list = [
 7910                value for value in param.get("calculations", "").split(",")
 7911            ]
 7912            log.info(f"Quick Calculations:")
 7913            for calculation_key in calculations_list:
 7914                log.info(f"   {calculation_key}")
 7915            for calculation_operation in calculations_list:
 7916                if calculation_operation.upper() not in operations:
 7917                    operations[calculation_operation.upper()] = {}
 7918                    add_value_into_dict(
 7919                        dict_tree=param,
 7920                        sections=[
 7921                            "calculation",
 7922                            "calculations",
 7923                            calculation_operation.upper(),
 7924                        ],
 7925                        value={},
 7926                    )
 7927
 7928        # Operations for calculation
 7929        if not operations:
 7930            operations = param.get("calculation", {}).get("calculations", {})
 7931
 7932        if operations:
 7933            log.info(f"Calculations...")
 7934
 7935        # For each operations
 7936        for operation_name in operations:
 7937            operation_name = operation_name.upper()
 7938            if operation_name not in [""]:
 7939                if operation_name in operations_config:
 7940                    log.info(f"Calculation '{operation_name}'")
 7941                    operation = operations_config[operation_name]
 7942                    operation_type = operation.get("type", "sql")
 7943                    if operation_type == "python":
 7944                        self.calculation_process_function(
 7945                            operation=operation, operation_name=operation_name
 7946                        )
 7947                    elif operation_type == "sql":
 7948                        self.calculation_process_sql(
 7949                            operation=operation, operation_name=operation_name
 7950                        )
 7951                    else:
 7952                        log.error(
 7953                            f"Operations config: Type '{operation_type}' NOT available"
 7954                        )
 7955                        raise ValueError(
 7956                            f"Operations config: Type '{operation_type}' NOT available"
 7957                        )
 7958                else:
 7959                    log.error(
 7960                        f"Operations config: Calculation '{operation_name}' NOT available"
 7961                    )
 7962                    raise ValueError(
 7963                        f"Operations config: Calculation '{operation_name}' NOT available"
 7964                    )
 7965
 7966        # Explode INFOS fields into table fields
 7967        if self.get_explode_infos():
 7968            self.explode_infos(
 7969                prefix=self.get_explode_infos_prefix(),
 7970                fields=self.get_explode_infos_fields(),
 7971                force=True,
 7972            )
 7973
 7974    def calculation_process_sql(
 7975        self, operation: dict, operation_name: str = "unknown"
 7976    ) -> None:
 7977        """
 7978        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7979        performs the operation, updating the specified table with the result.
 7980
 7981        :param operation: The `operation` parameter is a dictionary that contains information about the
 7982        mathematical operation to be performed. It includes the following keys:
 7983        :type operation: dict
 7984        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7985        the mathematical operation being performed. It is used for logging and error handling purposes,
 7986        defaults to unknown
 7987        :type operation_name: str (optional)
 7988        """
 7989
 7990        # table variants
 7991        table_variants = self.get_table_variants(clause="alter")
 7992
 7993        # Operation infos
 7994        operation_name = operation.get("name", "unknown")
 7995        log.debug(f"process sql {operation_name}")
 7996        output_column_name = operation.get("output_column_name", operation_name)
 7997        output_column_type = operation.get("output_column_type", "String")
 7998        prefix = operation.get("explode_infos_prefix", "")
 7999        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8000        output_column_description = operation.get(
 8001            "output_column_description", f"{operation_name} operation"
 8002        )
 8003        operation_query = operation.get("operation_query", None)
 8004        if isinstance(operation_query, list):
 8005            operation_query = " ".join(operation_query)
 8006        operation_info_fields = operation.get("info_fields", [])
 8007        operation_info_fields_check = operation.get("info_fields_check", False)
 8008        operation_info = operation.get("operation_info", True)
 8009
 8010        if operation_query:
 8011
 8012            # Info fields check
 8013            operation_info_fields_check_result = True
 8014            if operation_info_fields_check:
 8015                header_infos = self.get_header().infos
 8016                for info_field in operation_info_fields:
 8017                    operation_info_fields_check_result = (
 8018                        operation_info_fields_check_result
 8019                        and info_field in header_infos
 8020                    )
 8021
 8022            # If info fields available
 8023            if operation_info_fields_check_result:
 8024
 8025                # Added_columns
 8026                added_columns = []
 8027
 8028                # Create VCF header field
 8029                vcf_reader = self.get_header()
 8030                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8031                    output_column_name,
 8032                    ".",
 8033                    output_column_type,
 8034                    output_column_description,
 8035                    "howard calculation",
 8036                    "0",
 8037                    self.code_type_map.get(output_column_type),
 8038                )
 8039
 8040                # Explode infos if needed
 8041                log.debug(f"calculation_process_sql prefix {prefix}")
 8042                added_columns += self.explode_infos(
 8043                    prefix=prefix,
 8044                    fields=[output_column_name] + operation_info_fields,
 8045                    force=True,
 8046                )
 8047
 8048                # Create column
 8049                added_column = self.add_column(
 8050                    table_name=table_variants,
 8051                    column_name=prefix + output_column_name,
 8052                    column_type=output_column_type_sql,
 8053                    default_value="null",
 8054                )
 8055                added_columns.append(added_column)
 8056
 8057                # Operation calculation
 8058                try:
 8059
 8060                    # Query to update calculation column
 8061                    sql_update = f"""
 8062                        UPDATE {table_variants}
 8063                        SET "{prefix}{output_column_name}" = ({operation_query})
 8064                    """
 8065                    self.conn.execute(sql_update)
 8066
 8067                    # Add to INFO
 8068                    if operation_info:
 8069                        sql_update_info = f"""
 8070                            UPDATE {table_variants}
 8071                            SET "INFO" =
 8072                                concat(
 8073                                    CASE
 8074                                        WHEN "INFO" IS NOT NULL
 8075                                        THEN concat("INFO", ';')
 8076                                        ELSE ''
 8077                                    END,
 8078                                    '{output_column_name}=',
 8079                                    "{prefix}{output_column_name}"
 8080                                )
 8081                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8082                        """
 8083                        self.conn.execute(sql_update_info)
 8084
 8085                except:
 8086                    log.error(
 8087                        f"Operations config: Calculation '{operation_name}' query failed"
 8088                    )
 8089                    raise ValueError(
 8090                        f"Operations config: Calculation '{operation_name}' query failed"
 8091                    )
 8092
 8093                # Remove added columns
 8094                for added_column in added_columns:
 8095                    log.debug(f"added_column: {added_column}")
 8096                    self.drop_column(column=added_column)
 8097
 8098            else:
 8099                log.error(
 8100                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8101                )
 8102                raise ValueError(
 8103                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8104                )
 8105
 8106        else:
 8107            log.error(
 8108                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8109            )
 8110            raise ValueError(
 8111                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8112            )
 8113
 8114    def calculation_process_function(
 8115        self, operation: dict, operation_name: str = "unknown"
 8116    ) -> None:
 8117        """
 8118        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8119        function with the given parameters.
 8120
 8121        :param operation: The `operation` parameter is a dictionary that contains information about the
 8122        operation to be performed. It has the following keys:
 8123        :type operation: dict
 8124        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8125        the operation being performed. It is used for logging purposes, defaults to unknown
 8126        :type operation_name: str (optional)
 8127        """
 8128
 8129        operation_name = operation["name"]
 8130        log.debug(f"process sql {operation_name}")
 8131        function_name = operation["function_name"]
 8132        function_params = operation["function_params"]
 8133        getattr(self, function_name)(*function_params)
 8134
 8135    def calculation_variant_id(self) -> None:
 8136        """
 8137        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8138        updates the INFO field of a variants table with the variant ID.
 8139        """
 8140
 8141        # variant_id annotation field
 8142        variant_id_tag = self.get_variant_id_column()
 8143        added_columns = [variant_id_tag]
 8144
 8145        # variant_id hgvs tags"
 8146        vcf_infos_tags = {
 8147            variant_id_tag: "howard variant ID annotation",
 8148        }
 8149
 8150        # Variants table
 8151        table_variants = self.get_table_variants()
 8152
 8153        # Header
 8154        vcf_reader = self.get_header()
 8155
 8156        # Add variant_id to header
 8157        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8158            variant_id_tag,
 8159            ".",
 8160            "String",
 8161            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8162            "howard calculation",
 8163            "0",
 8164            self.code_type_map.get("String"),
 8165        )
 8166
 8167        # Update
 8168        sql_update = f"""
 8169            UPDATE {table_variants}
 8170            SET "INFO" = 
 8171                concat(
 8172                    CASE
 8173                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8174                        THEN ''
 8175                        ELSE concat("INFO", ';')
 8176                    END,
 8177                    '{variant_id_tag}=',
 8178                    "{variant_id_tag}"
 8179                )
 8180        """
 8181        self.conn.execute(sql_update)
 8182
 8183        # Remove added columns
 8184        for added_column in added_columns:
 8185            self.drop_column(column=added_column)
 8186
 8187    def calculation_extract_snpeff_hgvs(
 8188        self,
 8189        snpeff_hgvs: str = "snpeff_hgvs",
 8190        snpeff_field: str = "ANN",
 8191    ) -> None:
 8192        """
 8193        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8194        annotation field in a VCF file and adds them as a new column in the variants table.
 8195
 8196        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8197        function is used to specify the name of the column that will store the HGVS nomenclatures
 8198        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8199        snpeff_hgvs
 8200        :type snpeff_hgvs: str (optional)
 8201        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8202        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8203        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8204        to ANN
 8205        :type snpeff_field: str (optional)
 8206        """
 8207
 8208        # Snpeff hgvs tags
 8209        vcf_infos_tags = {
 8210            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8211        }
 8212
 8213        # Prefix
 8214        prefix = self.get_explode_infos_prefix()
 8215        if prefix:
 8216            prefix = "INFO/"
 8217
 8218        # snpEff fields
 8219        speff_ann_infos = prefix + snpeff_field
 8220        speff_hgvs_infos = prefix + snpeff_hgvs
 8221
 8222        # Variants table
 8223        table_variants = self.get_table_variants()
 8224
 8225        # Header
 8226        vcf_reader = self.get_header()
 8227
 8228        # Add columns
 8229        added_columns = []
 8230
 8231        # Explode HGVS field in column
 8232        added_columns += self.explode_infos(fields=[snpeff_field])
 8233
 8234        if snpeff_field in vcf_reader.infos:
 8235
 8236            log.debug(vcf_reader.infos[snpeff_field])
 8237
 8238            # Extract ANN header
 8239            ann_description = vcf_reader.infos[snpeff_field].desc
 8240            pattern = r"'(.+?)'"
 8241            match = re.search(pattern, ann_description)
 8242            if match:
 8243                ann_header_match = match.group(1).split(" | ")
 8244                ann_header_desc = {}
 8245                for i in range(len(ann_header_match)):
 8246                    ann_header_info = "".join(
 8247                        char for char in ann_header_match[i] if char.isalnum()
 8248                    )
 8249                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8250                if not ann_header_desc:
 8251                    raise ValueError("Invalid header description format")
 8252            else:
 8253                raise ValueError("Invalid header description format")
 8254
 8255            # Create variant id
 8256            variant_id_column = self.get_variant_id_column()
 8257            added_columns += [variant_id_column]
 8258
 8259            # Create dataframe
 8260            dataframe_snpeff_hgvs = self.get_query_to_df(
 8261                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8262            )
 8263
 8264            # Create main NOMEN column
 8265            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8266                speff_ann_infos
 8267            ].apply(
 8268                lambda x: extract_snpeff_hgvs(
 8269                    str(x), header=list(ann_header_desc.values())
 8270                )
 8271            )
 8272
 8273            # Add snpeff_hgvs to header
 8274            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8275                snpeff_hgvs,
 8276                ".",
 8277                "String",
 8278                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8279                "howard calculation",
 8280                "0",
 8281                self.code_type_map.get("String"),
 8282            )
 8283
 8284            # Update
 8285            sql_update = f"""
 8286                UPDATE variants
 8287                SET "INFO" = 
 8288                    concat(
 8289                        CASE
 8290                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8291                            THEN ''
 8292                            ELSE concat("INFO", ';')
 8293                        END,
 8294                        CASE 
 8295                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8296                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8297                            THEN concat(
 8298                                    '{snpeff_hgvs}=',
 8299                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8300                                )
 8301                            ELSE ''
 8302                        END
 8303                    )
 8304                FROM dataframe_snpeff_hgvs
 8305                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8306
 8307            """
 8308            self.conn.execute(sql_update)
 8309
 8310            # Delete dataframe
 8311            del dataframe_snpeff_hgvs
 8312            gc.collect()
 8313
 8314        else:
 8315
 8316            log.warning(
 8317                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8318            )
 8319
 8320        # Remove added columns
 8321        for added_column in added_columns:
 8322            self.drop_column(column=added_column)
 8323
 8324    def calculation_snpeff_ann_explode(
 8325        self,
 8326        uniquify: bool = True,
 8327        output_format: str = "fields",
 8328        output_prefix: str = "snpeff_",
 8329        snpeff_field: str = "ANN",
 8330    ) -> None:
 8331        """
 8332        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8333        exploding the HGVS field and updating variant information accordingly.
 8334
 8335        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8336        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8337        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8338        defaults to True
 8339        :type uniquify: bool (optional)
 8340        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8341        function specifies the format in which the output annotations will be generated. It has a
 8342        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8343        format, defaults to fields
 8344        :type output_format: str (optional)
 8345        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8346        method is used to specify the prefix that will be added to the output annotations generated
 8347        during the calculation process. This prefix helps to differentiate the newly added annotations
 8348        from existing ones in the output data. By default, the, defaults to ANN_
 8349        :type output_prefix: str (optional)
 8350        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8351        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8352        field will be processed to explode the HGVS annotations and update the variant information
 8353        accordingly, defaults to ANN
 8354        :type snpeff_field: str (optional)
 8355        """
 8356
 8357        # SnpEff annotation field
 8358        snpeff_hgvs = "snpeff_ann_explode"
 8359
 8360        # Snpeff hgvs tags
 8361        vcf_infos_tags = {
 8362            snpeff_hgvs: "Explode snpEff annotations",
 8363        }
 8364
 8365        # Prefix
 8366        prefix = self.get_explode_infos_prefix()
 8367        if prefix:
 8368            prefix = "INFO/"
 8369
 8370        # snpEff fields
 8371        speff_ann_infos = prefix + snpeff_field
 8372        speff_hgvs_infos = prefix + snpeff_hgvs
 8373
 8374        # Variants table
 8375        table_variants = self.get_table_variants()
 8376
 8377        # Header
 8378        vcf_reader = self.get_header()
 8379
 8380        # Add columns
 8381        added_columns = []
 8382
 8383        # Explode HGVS field in column
 8384        added_columns += self.explode_infos(fields=[snpeff_field])
 8385        log.debug(f"snpeff_field={snpeff_field}")
 8386        log.debug(f"added_columns={added_columns}")
 8387
 8388        if snpeff_field in vcf_reader.infos:
 8389
 8390            # Extract ANN header
 8391            ann_description = vcf_reader.infos[snpeff_field].desc
 8392            pattern = r"'(.+?)'"
 8393            match = re.search(pattern, ann_description)
 8394            if match:
 8395                ann_header_match = match.group(1).split(" | ")
 8396                ann_header = []
 8397                ann_header_desc = {}
 8398                for i in range(len(ann_header_match)):
 8399                    ann_header_info = "".join(
 8400                        char for char in ann_header_match[i] if char.isalnum()
 8401                    )
 8402                    ann_header.append(ann_header_info)
 8403                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8404                if not ann_header_desc:
 8405                    raise ValueError("Invalid header description format")
 8406            else:
 8407                raise ValueError("Invalid header description format")
 8408
 8409            # Create variant id
 8410            variant_id_column = self.get_variant_id_column()
 8411            added_columns += [variant_id_column]
 8412
 8413            # Create dataframe
 8414            dataframe_snpeff_hgvs = self.get_query_to_df(
 8415                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8416            )
 8417
 8418            # Create snpEff columns
 8419            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8420                speff_ann_infos
 8421            ].apply(
 8422                lambda x: explode_snpeff_ann(
 8423                    str(x),
 8424                    uniquify=uniquify,
 8425                    output_format=output_format,
 8426                    prefix=output_prefix,
 8427                    header=list(ann_header_desc.values()),
 8428                )
 8429            )
 8430
 8431            # Header
 8432            ann_annotations_prefix = ""
 8433            if output_format.upper() in ["JSON"]:
 8434                ann_annotations_prefix = f"{output_prefix}="
 8435                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8436                    output_prefix,
 8437                    ".",
 8438                    "String",
 8439                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8440                    + " - JSON format",
 8441                    "howard calculation",
 8442                    "0",
 8443                    self.code_type_map.get("String"),
 8444                )
 8445            else:
 8446                for ann_annotation in ann_header:
 8447                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8448                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8449                        ann_annotation_id,
 8450                        ".",
 8451                        "String",
 8452                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8453                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8454                        "howard calculation",
 8455                        "0",
 8456                        self.code_type_map.get("String"),
 8457                    )
 8458
 8459            # Update
 8460            sql_update = f"""
 8461                UPDATE variants
 8462                SET "INFO" = 
 8463                    concat(
 8464                        CASE
 8465                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8466                            THEN ''
 8467                            ELSE concat("INFO", ';')
 8468                        END,
 8469                        CASE 
 8470                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8471                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8472                            THEN concat(
 8473                                '{ann_annotations_prefix}',
 8474                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8475                                )
 8476                            ELSE ''
 8477                        END
 8478                    )
 8479                FROM dataframe_snpeff_hgvs
 8480                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8481
 8482            """
 8483            self.conn.execute(sql_update)
 8484
 8485            # Delete dataframe
 8486            del dataframe_snpeff_hgvs
 8487            gc.collect()
 8488
 8489        else:
 8490
 8491            log.warning(
 8492                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8493            )
 8494
 8495        # Remove added columns
 8496        for added_column in added_columns:
 8497            self.drop_column(column=added_column)
 8498
    def calculation_extract_nomen(self) -> None:
        """
        Extract the NOMEN hgvs nomenclature components from an HGVS annotation
        field and append them to the INFO field of the variants table.

        The HGVS field (param `calculation.calculations.NOMEN.options.hgvs_field`,
        default "hgvs") is exploded into a column, each value is resolved with
        `find_nomen` (optionally constrained by a transcripts file given in
        `calculation.calculations.NOMEN.options.transcripts`), and each
        component of the NOMEN structure (NOMEN, CNOMEN, PNOMEN, ...) is
        written back into INFO and registered in the VCF header. Temporary
        columns are dropped afterwards.

        :raises ValueError: when the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the full NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN components and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field holding HGVS annotations (from param)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional transcripts file used to prioritize transcripts (from param)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the transcripts file is the transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed when the exploded HGVS column is available
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Fetch variant keys and HGVS annotations
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Resolve the NOMEN structure (a dict of components) per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (the lambda's capture of nomen_field is safe: apply() runs
                # immediately within this iteration)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Each component contributes ';<field>=<value>' when non-empty.
                # NOTE(review): combined with the CASE below that leaves INFO
                # as-is, an empty/NULL INFO yields a leading ';' before the
                # first component — presumably tolerated downstream; confirm.
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Join the CASE expressions as additional concat() arguments
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN components to INFO, joining on the variant key
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe eagerly (it can be large)
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8641
 8642    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 8643        """
 8644        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 8645        pipeline/sample for a variant and updates the variant information in a VCF file.
 8646
 8647        :param tag: The `tag` parameter is a string that represents the annotation field for the
 8648        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 8649        VCF header and to update the corresponding field in the variants table, defaults to
 8650        findbypipeline
 8651        :type tag: str (optional)
 8652        """
 8653
 8654        # if FORMAT and samples
 8655        if (
 8656            "FORMAT" in self.get_header_columns_as_list()
 8657            and self.get_header_sample_list()
 8658        ):
 8659
 8660            # findbypipeline annotation field
 8661            findbypipeline_tag = tag
 8662
 8663            # VCF infos tags
 8664            vcf_infos_tags = {
 8665                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 8666            }
 8667
 8668            # Prefix
 8669            prefix = self.get_explode_infos_prefix()
 8670
 8671            # Field
 8672            findbypipeline_infos = prefix + findbypipeline_tag
 8673
 8674            # Variants table
 8675            table_variants = self.get_table_variants()
 8676
 8677            # Header
 8678            vcf_reader = self.get_header()
 8679
 8680            # Create variant id
 8681            variant_id_column = self.get_variant_id_column()
 8682            added_columns = [variant_id_column]
 8683
 8684            # variant_id, FORMAT and samples
 8685            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8686                self.get_header_sample_list()
 8687            )
 8688
 8689            # Create dataframe
 8690            dataframe_findbypipeline = self.get_query_to_df(
 8691                f""" SELECT {samples_fields} FROM {table_variants} """
 8692            )
 8693
 8694            # Create findbypipeline column
 8695            dataframe_findbypipeline[findbypipeline_infos] = (
 8696                dataframe_findbypipeline.apply(
 8697                    lambda row: findbypipeline(
 8698                        row, samples=self.get_header_sample_list()
 8699                    ),
 8700                    axis=1,
 8701                )
 8702            )
 8703
 8704            # Add snpeff_hgvs to header
 8705            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 8706                findbypipeline_tag,
 8707                ".",
 8708                "String",
 8709                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 8710                "howard calculation",
 8711                "0",
 8712                self.code_type_map.get("String"),
 8713            )
 8714
 8715            # Update
 8716            sql_update = f"""
 8717                UPDATE variants
 8718                SET "INFO" = 
 8719                    concat(
 8720                        CASE
 8721                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8722                            THEN ''
 8723                            ELSE concat("INFO", ';')
 8724                        END,
 8725                        CASE 
 8726                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 8727                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 8728                            THEN concat(
 8729                                    '{findbypipeline_tag}=',
 8730                                    dataframe_findbypipeline."{findbypipeline_infos}"
 8731                                )
 8732                            ELSE ''
 8733                        END
 8734                    )
 8735                FROM dataframe_findbypipeline
 8736                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 8737            """
 8738            self.conn.execute(sql_update)
 8739
 8740            # Remove added columns
 8741            for added_column in added_columns:
 8742                self.drop_column(column=added_column)
 8743
 8744            # Delete dataframe
 8745            del dataframe_findbypipeline
 8746            gc.collect()
 8747
 8748    def calculation_genotype_concordance(self) -> None:
 8749        """
 8750        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8751        multi-caller VCF files and updates the variant information in the database.
 8752        """
 8753
 8754        # if FORMAT and samples
 8755        if (
 8756            "FORMAT" in self.get_header_columns_as_list()
 8757            and self.get_header_sample_list()
 8758        ):
 8759
 8760            # genotypeconcordance annotation field
 8761            genotypeconcordance_tag = "genotypeconcordance"
 8762
 8763            # VCF infos tags
 8764            vcf_infos_tags = {
 8765                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8766            }
 8767
 8768            # Prefix
 8769            prefix = self.get_explode_infos_prefix()
 8770
 8771            # Field
 8772            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8773
 8774            # Variants table
 8775            table_variants = self.get_table_variants()
 8776
 8777            # Header
 8778            vcf_reader = self.get_header()
 8779
 8780            # Create variant id
 8781            variant_id_column = self.get_variant_id_column()
 8782            added_columns = [variant_id_column]
 8783
 8784            # variant_id, FORMAT and samples
 8785            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8786                self.get_header_sample_list()
 8787            )
 8788
 8789            # Create dataframe
 8790            dataframe_genotypeconcordance = self.get_query_to_df(
 8791                f""" SELECT {samples_fields} FROM {table_variants} """
 8792            )
 8793
 8794            # Create genotypeconcordance column
 8795            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8796                dataframe_genotypeconcordance.apply(
 8797                    lambda row: genotypeconcordance(
 8798                        row, samples=self.get_header_sample_list()
 8799                    ),
 8800                    axis=1,
 8801                )
 8802            )
 8803
 8804            # Add genotypeconcordance to header
 8805            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8806                genotypeconcordance_tag,
 8807                ".",
 8808                "String",
 8809                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8810                "howard calculation",
 8811                "0",
 8812                self.code_type_map.get("String"),
 8813            )
 8814
 8815            # Update
 8816            sql_update = f"""
 8817                UPDATE variants
 8818                SET "INFO" = 
 8819                    concat(
 8820                        CASE
 8821                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8822                            THEN ''
 8823                            ELSE concat("INFO", ';')
 8824                        END,
 8825                        CASE
 8826                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8827                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8828                            THEN concat(
 8829                                    '{genotypeconcordance_tag}=',
 8830                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8831                                )
 8832                            ELSE ''
 8833                        END
 8834                    )
 8835                FROM dataframe_genotypeconcordance
 8836                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8837            """
 8838            self.conn.execute(sql_update)
 8839
 8840            # Remove added columns
 8841            for added_column in added_columns:
 8842                self.drop_column(column=added_column)
 8843
 8844            # Delete dataframe
 8845            del dataframe_genotypeconcordance
 8846            gc.collect()
 8847
 8848    def calculation_barcode(self, tag: str = "barcode") -> None:
 8849        """
 8850        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8851        updates the INFO field in the file with the calculated barcode values.
 8852
 8853        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8854        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8855        the default tag name is set to "barcode", defaults to barcode
 8856        :type tag: str (optional)
 8857        """
 8858
 8859        # if FORMAT and samples
 8860        if (
 8861            "FORMAT" in self.get_header_columns_as_list()
 8862            and self.get_header_sample_list()
 8863        ):
 8864
 8865            # barcode annotation field
 8866            if not tag:
 8867                tag = "barcode"
 8868
 8869            # VCF infos tags
 8870            vcf_infos_tags = {
 8871                tag: "barcode calculation (VaRank)",
 8872            }
 8873
 8874            # Prefix
 8875            prefix = self.get_explode_infos_prefix()
 8876
 8877            # Field
 8878            barcode_infos = prefix + tag
 8879
 8880            # Variants table
 8881            table_variants = self.get_table_variants()
 8882
 8883            # Header
 8884            vcf_reader = self.get_header()
 8885
 8886            # Create variant id
 8887            variant_id_column = self.get_variant_id_column()
 8888            added_columns = [variant_id_column]
 8889
 8890            # variant_id, FORMAT and samples
 8891            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8892                self.get_header_sample_list()
 8893            )
 8894
 8895            # Create dataframe
 8896            dataframe_barcode = self.get_query_to_df(
 8897                f""" SELECT {samples_fields} FROM {table_variants} """
 8898            )
 8899
 8900            # Create barcode column
 8901            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8902                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8903            )
 8904
 8905            # Add barcode to header
 8906            vcf_reader.infos[tag] = vcf.parser._Info(
 8907                tag,
 8908                ".",
 8909                "String",
 8910                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8911                "howard calculation",
 8912                "0",
 8913                self.code_type_map.get("String"),
 8914            )
 8915
 8916            # Update
 8917            sql_update = f"""
 8918                UPDATE {table_variants}
 8919                SET "INFO" = 
 8920                    concat(
 8921                        CASE
 8922                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8923                            THEN ''
 8924                            ELSE concat("INFO", ';')
 8925                        END,
 8926                        CASE
 8927                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8928                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8929                            THEN concat(
 8930                                    '{tag}=',
 8931                                    dataframe_barcode."{barcode_infos}"
 8932                                )
 8933                            ELSE ''
 8934                        END
 8935                    )
 8936                FROM dataframe_barcode
 8937                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8938            """
 8939            self.conn.execute(sql_update)
 8940
 8941            # Remove added columns
 8942            for added_column in added_columns:
 8943                self.drop_column(column=added_column)
 8944
 8945            # Delete dataframe
 8946            del dataframe_barcode
 8947            gc.collect()
 8948
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        Two genotype (FORMAT) fields are appended to every sample column:
        - `<tag>`: the family barcode computed over the pedigree samples
        - `<tag>S`: the comma-separated list of samples used for the barcode

        The pedigree is read from param `calculation.calculations.BARCODEFAMILY.family_pedigree`
        and may be a JSON file path, a JSON string, a comma-separated list of sample names,
        or a dict; when absent, all samples of the VCF header are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or empty
        """

        # Barcode family requires genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag if an empty value was passed
            if not tag:
                tag = "BCF"

            # VCF FORMAT tag descriptions (used for the header entries)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree parameter (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated sample names
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples used for the barcode (dict values, insertion order)
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Fail early on an empty pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved pedigree
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the temporary dataframe column holding the barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the table, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed for the calculation: variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes as a dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Register both FORMAT tags (<tag> and <tag>S) in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column (every sample + FORMAT):
            # - pedigree samples get the computed barcode and the sample list
            # - FORMAT gets the two new tag names appended
            # - other samples get missing values ('.')
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Strips FORMAT down to its ':' separators so a fully missing genotype
                # './.' can be padded to './.:.:...' with one '.' per FORMAT subfield
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_barcode
            gc.collect()
 9138
 9139    def calculation_trio(self) -> None:
 9140        """
 9141        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9142        information to the INFO field of each variant.
 9143        """
 9144
 9145        # if FORMAT and samples
 9146        if (
 9147            "FORMAT" in self.get_header_columns_as_list()
 9148            and self.get_header_sample_list()
 9149        ):
 9150
 9151            # trio annotation field
 9152            trio_tag = "trio"
 9153
 9154            # VCF infos tags
 9155            vcf_infos_tags = {
 9156                "trio": "trio calculation",
 9157            }
 9158
 9159            # Param
 9160            param = self.get_param()
 9161
 9162            # Prefix
 9163            prefix = self.get_explode_infos_prefix()
 9164
 9165            # Trio param
 9166            trio_ped = (
 9167                param.get("calculation", {})
 9168                .get("calculations", {})
 9169                .get("TRIO", {})
 9170                .get("trio_pedigree", None)
 9171            )
 9172
 9173            # Load trio
 9174            if trio_ped:
 9175
 9176                # Trio pedigree is a file
 9177                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9178                    log.debug("TRIO pedigree is file")
 9179                    with open(full_path(trio_ped)) as trio_ped:
 9180                        trio_ped = json.load(trio_ped)
 9181
 9182                # Trio pedigree is a string
 9183                elif isinstance(trio_ped, str):
 9184                    log.debug("TRIO pedigree is str")
 9185                    try:
 9186                        trio_ped = json.loads(trio_ped)
 9187                        log.debug("TRIO pedigree is json str")
 9188                    except ValueError as e:
 9189                        trio_samples = trio_ped.split(",")
 9190                        if len(trio_samples) == 3:
 9191                            trio_ped = {
 9192                                "father": trio_samples[0],
 9193                                "mother": trio_samples[1],
 9194                                "child": trio_samples[2],
 9195                            }
 9196                            log.debug("TRIO pedigree is list str")
 9197                        else:
 9198                            msg_error = "TRIO pedigree not well formatted"
 9199                            log.error(msg_error)
 9200                            raise ValueError(msg_error)
 9201
 9202                # Trio pedigree is a dict
 9203                elif isinstance(trio_ped, dict):
 9204                    log.debug("TRIO pedigree is dict")
 9205
 9206                # Trio pedigree is not well formatted
 9207                else:
 9208                    msg_error = "TRIO pedigree not well formatted"
 9209                    log.error(msg_error)
 9210                    raise ValueError(msg_error)
 9211
 9212                # Construct trio list
 9213                trio_samples = [
 9214                    trio_ped.get("father", ""),
 9215                    trio_ped.get("mother", ""),
 9216                    trio_ped.get("child", ""),
 9217                ]
 9218
 9219            else:
 9220                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9221                samples_list = self.get_header_sample_list()
 9222                if len(samples_list) >= 3:
 9223                    trio_samples = self.get_header_sample_list()[0:3]
 9224                    trio_ped = {
 9225                        "father": trio_samples[0],
 9226                        "mother": trio_samples[1],
 9227                        "child": trio_samples[2],
 9228                    }
 9229                else:
 9230                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9231                    log.error(msg_error)
 9232                    raise ValueError(msg_error)
 9233
 9234            # Check trio pedigree
 9235            if not trio_ped or len(trio_ped) != 3:
 9236                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9237                log.error(msg_error)
 9238                raise ValueError(msg_error)
 9239
 9240            # Log
 9241            log.info(
 9242                f"Calculation 'TRIO' - Samples: "
 9243                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9244            )
 9245
 9246            # Field
 9247            trio_infos = prefix + trio_tag
 9248
 9249            # Variants table
 9250            table_variants = self.get_table_variants()
 9251
 9252            # Header
 9253            vcf_reader = self.get_header()
 9254
 9255            # Create variant id
 9256            variant_id_column = self.get_variant_id_column()
 9257            added_columns = [variant_id_column]
 9258
 9259            # variant_id, FORMAT and samples
 9260            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9261                self.get_header_sample_list()
 9262            )
 9263
 9264            # Create dataframe
 9265            dataframe_trio = self.get_query_to_df(
 9266                f""" SELECT {samples_fields} FROM {table_variants} """
 9267            )
 9268
 9269            # Create trio column
 9270            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9271                lambda row: trio(row, samples=trio_samples), axis=1
 9272            )
 9273
 9274            # Add trio to header
 9275            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9276                trio_tag,
 9277                ".",
 9278                "String",
 9279                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9280                "howard calculation",
 9281                "0",
 9282                self.code_type_map.get("String"),
 9283            )
 9284
 9285            # Update
 9286            sql_update = f"""
 9287                UPDATE {table_variants}
 9288                SET "INFO" = 
 9289                    concat(
 9290                        CASE
 9291                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9292                            THEN ''
 9293                            ELSE concat("INFO", ';')
 9294                        END,
 9295                        CASE
 9296                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9297                             AND dataframe_trio."{trio_infos}" NOT NULL
 9298                            THEN concat(
 9299                                    '{trio_tag}=',
 9300                                    dataframe_trio."{trio_infos}"
 9301                                )
 9302                            ELSE ''
 9303                        END
 9304                    )
 9305                FROM dataframe_trio
 9306                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9307            """
 9308            self.conn.execute(sql_update)
 9309
 9310            # Remove added columns
 9311            for added_column in added_columns:
 9312                self.drop_column(column=added_column)
 9313
 9314            # Delete dataframe
 9315            del dataframe_trio
 9316            gc.collect()
 9317
 9318    def calculation_vaf_normalization(self) -> None:
 9319        """
 9320        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9321        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9322        :return: The function does not return anything.
 9323        """
 9324
 9325        # if FORMAT and samples
 9326        if (
 9327            "FORMAT" in self.get_header_columns_as_list()
 9328            and self.get_header_sample_list()
 9329        ):
 9330
 9331            # vaf_normalization annotation field
 9332            vaf_normalization_tag = "VAF"
 9333
 9334            # VCF infos tags
 9335            vcf_infos_tags = {
 9336                "VAF": "VAF Variant Frequency",
 9337            }
 9338
 9339            # Prefix
 9340            prefix = self.get_explode_infos_prefix()
 9341
 9342            # Variants table
 9343            table_variants = self.get_table_variants()
 9344
 9345            # Header
 9346            vcf_reader = self.get_header()
 9347
 9348            # Do not calculate if VAF already exists
 9349            if "VAF" in vcf_reader.formats:
 9350                log.debug("VAF already on genotypes")
 9351                return
 9352
 9353            # Create variant id
 9354            variant_id_column = self.get_variant_id_column()
 9355            added_columns = [variant_id_column]
 9356
 9357            # variant_id, FORMAT and samples
 9358            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9359                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9360            )
 9361
 9362            # Create dataframe
 9363            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9364            log.debug(f"query={query}")
 9365            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9366
 9367            vaf_normalization_set = []
 9368
 9369            # for each sample vaf_normalization
 9370            for sample in self.get_header_sample_list():
 9371                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9372                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9373                )
 9374                vaf_normalization_set.append(
 9375                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9376                )
 9377
 9378            # Add VAF to FORMAT
 9379            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9380                "FORMAT"
 9381            ].apply(lambda x: str(x) + ":VAF")
 9382            vaf_normalization_set.append(
 9383                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9384            )
 9385
 9386            # Add vaf_normalization to header
 9387            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9388                id=vaf_normalization_tag,
 9389                num="1",
 9390                type="Float",
 9391                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9392                type_code=self.code_type_map.get("Float"),
 9393            )
 9394
 9395            # Create fields to add in INFO
 9396            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9397
 9398            # Update
 9399            sql_update = f"""
 9400                UPDATE {table_variants}
 9401                SET {sql_vaf_normalization_set}
 9402                FROM dataframe_vaf_normalization
 9403                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9404
 9405            """
 9406            self.conn.execute(sql_update)
 9407
 9408            # Remove added columns
 9409            for added_column in added_columns:
 9410                self.drop_column(column=added_column)
 9411
 9412            # Delete dataframe
 9413            del dataframe_vaf_normalization
 9414            gc.collect()
 9415
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are produced from the per-sample values of `info`:
        `<info>_stats_nb`, `_list`, `_min`, `_max`, `_mean`, `_mediane` and `_stdev`.
        The calculation is skipped when the file has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Stats require genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag name for the statistics (e.g. 'VAF_stats')
            vaf_stats_tag = info + "_stats"

            # VCF INFO tag descriptions, one entry per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the temporary dataframe column holding the stats
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added to the table, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns needed for the calculation: variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes as a dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute all statistics at once for each variant row
            # (genotype_stats presumably returns a dict keyed by the tags above —
            # verify against howard.functions.commons)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic, concatenated into INFO
            sql_vaf_stats_fields = []

            # Extract each statistic into its own column and build its SQL fragment
            for stat in vcf_infos_tags:

                # Extract the statistic value from the stats result
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Register the statistic tag in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator for every fragment except the first.
                # NOTE(review): if the first statistic were NULL, the next fragment
                # would still lead with ';' — confirm NULL stats cannot occur here
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Fragment appending '<stat>=<value>' when the value is not NULL
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Concatenate all fragments for the update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append all statistics to INFO, handling empty/missing INFO
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the dataframe memory
            del dataframe_vaf_stats
            gc.collect()
 9553
 9554    def calculation_transcripts_annotation(
 9555        self, info_json: str = None, info_format: str = None
 9556    ) -> None:
 9557        """
 9558        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9559        field to it if transcripts are available.
 9560
 9561        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9562        is a string parameter that represents the information field to be used in the transcripts JSON.
 9563        It is used to specify the JSON format for the transcripts information. If no value is provided
 9564        when calling the method, it defaults to "
 9565        :type info_json: str
 9566        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9567        method is a string parameter that specifies the format of the information field to be used in
 9568        the transcripts JSON. It is used to define the format of the information field
 9569        :type info_format: str
 9570        """
 9571
 9572        # Create transcripts table
 9573        transcripts_table = self.create_transcript_view()
 9574
 9575        # Add info field
 9576        if transcripts_table:
 9577            self.transcript_view_to_variants(
 9578                transcripts_table=transcripts_table,
 9579                transcripts_info_field_json=info_json,
 9580                transcripts_info_field_format=info_format,
 9581            )
 9582        else:
 9583            log.info("No Transcripts to process. Check param.json file configuration")
 9584
 9585    def calculation_transcripts_prioritization(self) -> None:
 9586        """
 9587        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9588        prioritizes transcripts based on certain criteria.
 9589        """
 9590
 9591        # Create transcripts table
 9592        transcripts_table = self.create_transcript_view()
 9593
 9594        # Add info field
 9595        if transcripts_table:
 9596            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9597        else:
 9598            log.info("No Transcripts to process. Check param.json file configuration")
 9599
 9600    ###############
 9601    # Transcripts #
 9602    ###############
 9603
 9604    def transcripts_prioritization(
 9605        self, transcripts_table: str = None, param: dict = {}
 9606    ) -> bool:
 9607        """
 9608        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9609        and updates the variants table with the prioritized information.
 9610
 9611        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9612        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9613        This parameter is used to identify the table where the transcripts data is stored for the
 9614        prioritization process
 9615        :type transcripts_table: str
 9616        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9617        that contains various configuration settings for the prioritization process of transcripts. It
 9618        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9619        the prefix for prioritization fields, default profiles, and other
 9620        :type param: dict
 9621        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9622        transcripts prioritization process is successfully completed, and `False` if there are any
 9623        issues or if no profile is defined for transcripts prioritization.
 9624        """
 9625
 9626        log.debug("Start transcripts prioritization...")
 9627
 9628        # Param
 9629        if not param:
 9630            param = self.get_param()
 9631
 9632        # Variants table
 9633        table_variants = self.get_table_variants()
 9634        log.debug(f"transcripts_table={transcripts_table}")
 9635        # Transcripts table
 9636        if transcripts_table is None:
 9637            log.debug(f"transcripts_table={transcripts_table}")
 9638            transcripts_table = self.create_transcript_view(
 9639                transcripts_table="transcripts", param=param
 9640            )
 9641            log.debug(f"transcripts_table={transcripts_table}")
 9642        if transcripts_table is None:
 9643            msg_err = "No Transcripts table availalble"
 9644            log.error(msg_err)
 9645            raise ValueError(msg_err)
 9646
 9647        # Get transcripts columns
 9648        columns_as_list_query = f"""
 9649            DESCRIBE {transcripts_table}
 9650        """
 9651        columns_as_list = list(
 9652            self.get_query_to_df(columns_as_list_query)["column_name"]
 9653        )
 9654
 9655        # Create INFO if not exists
 9656        if "INFO" not in columns_as_list:
 9657            query_add_info = f"""
 9658                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9659            """
 9660            self.execute_query(query_add_info)
 9661
 9662        # Prioritization param and Force only PZ Score and Flag
 9663        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9664        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9665        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9666        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9667        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9668        pz_profile_default = (
 9669            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9670        )
 9671
 9672        # Exit if no profile
 9673        if pz_profile_default is None:
 9674            log.warning("No profile defined for transcripts prioritization")
 9675            return False
 9676
 9677        # Prioritization
 9678        prioritization_result = self.prioritization(
 9679            table=transcripts_table,
 9680            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9681        )
 9682        if not prioritization_result:
 9683            log.warning("Transcripts prioritization not processed")
 9684            return False
 9685
 9686        # Explode PZ fields
 9687        self.explode_infos(
 9688            table=transcripts_table,
 9689            fields=param.get("transcripts", {})
 9690            .get("prioritization", {})
 9691            .get("pzfields", []),
 9692        )
 9693
 9694        # Export Transcripts prioritization infos to variants table
 9695        query_update = f"""
 9696            WITH RankedTranscripts AS (
 9697                SELECT
 9698                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9699                    ROW_NUMBER() OVER (
 9700                        PARTITION BY "#CHROM", POS, REF, ALT
 9701                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9702                    ) AS rn
 9703                FROM
 9704                    {transcripts_table}
 9705            )
 9706            UPDATE {table_variants}
 9707                SET
 9708                INFO = CONCAT(CASE
 9709                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9710                            THEN ''
 9711                            ELSE concat("INFO", ';')
 9712                        END,
 9713                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9714                        )
 9715            FROM
 9716                RankedTranscripts
 9717            WHERE
 9718                rn = 1
 9719                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9720                AND variants."POS" = RankedTranscripts."POS"
 9721                AND variants."REF" = RankedTranscripts."REF"
 9722                AND variants."ALT" = RankedTranscripts."ALT"
 9723                
 9724        """
 9725        self.execute_query(query=query_update)
 9726
 9727        # Add PZ Transcript in header
 9728        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9729            pz_fields_transcripts,
 9730            ".",
 9731            "String",
 9732            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9733            "unknown",
 9734            "unknown",
 9735            code_type_map["String"],
 9736        )
 9737
 9738        # Return
 9739        return True
 9740
 9741    def create_transcript_view_from_columns_map(
 9742        self,
 9743        transcripts_table: str = "transcripts",
 9744        columns_maps: dict = {},
 9745        added_columns: list = [],
 9746        temporary_tables: list = None,
 9747        annotation_fields: list = None,
 9748    ) -> tuple[list, list, list]:
 9749        """
 9750        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9751        specified columns mapping for transcripts data.
 9752
 9753        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9754        the table where the transcripts data is stored or will be stored in the database. This table
 9755        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9756        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9757        :type transcripts_table: str (optional)
 9758        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9759        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9760        represents a mapping configuration for a specific set of columns. It typically includes details such
 9761        as the main transcript column and additional information columns
 9762        :type columns_maps: dict
 9763        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9764        function is a list that stores the additional columns that will be added to the view being created
 9765        based on the columns map provided. These columns are generated by exploding the transcript
 9766        information columns along with the main transcript column
 9767        :type added_columns: list
 9768        :param temporary_tables: The `temporary_tables` parameter in the
 9769        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9770        tables created during the process of creating a transcript view from a columns map. These temporary
 9771        tables are used to store intermediate results or transformations before the final view is generated
 9772        :type temporary_tables: list
 9773        :param annotation_fields: The `annotation_fields` parameter in the
 9774        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9775        for annotation in the query view creation process. These fields are extracted from the
 9776        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9777        :type annotation_fields: list
 9778        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9779        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9780        """
 9781
 9782        log.debug("Start transcrpts view creation from columns map...")
 9783
 9784        # "from_columns_map": [
 9785        #     {
 9786        #         "transcripts_column": "Ensembl_transcriptid",
 9787        #         "transcripts_infos_columns": [
 9788        #             "genename",
 9789        #             "Ensembl_geneid",
 9790        #             "LIST_S2_score",
 9791        #             "LIST_S2_pred",
 9792        #         ],
 9793        #     },
 9794        #     {
 9795        #         "transcripts_column": "Ensembl_transcriptid",
 9796        #         "transcripts_infos_columns": [
 9797        #             "genename",
 9798        #             "VARITY_R_score",
 9799        #             "Aloft_pred",
 9800        #         ],
 9801        #     },
 9802        # ],
 9803
 9804        # Init
 9805        if temporary_tables is None:
 9806            temporary_tables = []
 9807        if annotation_fields is None:
 9808            annotation_fields = []
 9809
 9810        # Variants table
 9811        table_variants = self.get_table_variants()
 9812
 9813        for columns_map in columns_maps:
 9814
 9815            # Transcript column
 9816            transcripts_column = columns_map.get("transcripts_column", None)
 9817
 9818            # Transcripts infos columns
 9819            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9820
 9821            if transcripts_column is not None:
 9822
 9823                # Explode
 9824                added_columns += self.explode_infos(
 9825                    fields=[transcripts_column] + transcripts_infos_columns
 9826                )
 9827
 9828                # View clauses
 9829                clause_select = []
 9830                for field in [transcripts_column] + transcripts_infos_columns:
 9831                    clause_select.append(
 9832                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9833                    )
 9834                    if field not in [transcripts_column]:
 9835                        annotation_fields.append(field)
 9836
 9837                # Querey View
 9838                query = f""" 
 9839                    SELECT
 9840                        "#CHROM", POS, REF, ALT, INFO,
 9841                        "{transcripts_column}" AS 'transcript',
 9842                        {", ".join(clause_select)}
 9843                    FROM (
 9844                        SELECT 
 9845                            "#CHROM", POS, REF, ALT, INFO,
 9846                            {", ".join(clause_select)}
 9847                        FROM {table_variants}
 9848                        )
 9849                    WHERE "{transcripts_column}" IS NOT NULL
 9850                """
 9851
 9852                # Create temporary table
 9853                temporary_table = transcripts_table + "".join(
 9854                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9855                )
 9856
 9857                # Temporary_tables
 9858                temporary_tables.append(temporary_table)
 9859                query_view = f"""
 9860                    CREATE TEMPORARY TABLE {temporary_table}
 9861                    AS ({query})
 9862                """
 9863                self.execute_query(query=query_view)
 9864
 9865        return added_columns, temporary_tables, annotation_fields
 9866
 9867    def create_transcript_view_from_column_format(
 9868        self,
 9869        transcripts_table: str = "transcripts",
 9870        column_formats: dict = {},
 9871        temporary_tables: list = None,
 9872        annotation_fields: list = None,
 9873    ) -> tuple[list, list, list]:
 9874        """
 9875        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9876        specified column formats, adds additional columns and annotation fields, and returns the list of
 9877        temporary tables and annotation fields.
 9878
 9879        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9880        the table containing the transcripts data. This table will be used as the base table for creating
 9881        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9882        different table name if needed, defaults to transcripts
 9883        :type transcripts_table: str (optional)
 9884        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9885        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9886        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9887        the provided code snippet:
 9888        :type column_formats: dict
 9889        :param temporary_tables: The `temporary_tables` parameter in the
 9890        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9891        views created during the process of creating a transcript view from a column format. These temporary
 9892        views are used to manipulate and extract data before generating the final transcript view. It
 9893        :type temporary_tables: list
 9894        :param annotation_fields: The `annotation_fields` parameter in the
 9895        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9896        that are extracted from the temporary views created during the process. These annotation fields are
 9897        obtained by querying the temporary views and extracting the column names excluding specific columns
 9898        like `#CH
 9899        :type annotation_fields: list
 9900        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9901        `temporary_tables` and `annotation_fields`.
 9902        """
 9903
 9904        log.debug("Start transcrpts view creation from column format...")
 9905
 9906        #  "from_column_format": [
 9907        #     {
 9908        #         "transcripts_column": "ANN",
 9909        #         "transcripts_infos_column": "Feature_ID",
 9910        #     }
 9911        # ],
 9912
 9913        # Init
 9914        if temporary_tables is None:
 9915            temporary_tables = []
 9916        if annotation_fields is None:
 9917            annotation_fields = []
 9918
 9919        for column_format in column_formats:
 9920
 9921            # annotation field and transcript annotation field
 9922            annotation_field = column_format.get("transcripts_column", "ANN")
 9923            transcript_annotation = column_format.get(
 9924                "transcripts_infos_column", "Feature_ID"
 9925            )
 9926
 9927            # Temporary View name
 9928            temporary_view_name = transcripts_table + "".join(
 9929                random.choices(string.ascii_uppercase + string.digits, k=10)
 9930            )
 9931
 9932            # Create temporary view name
 9933            temporary_view_name = self.annotation_format_to_table(
 9934                uniquify=True,
 9935                annotation_field=annotation_field,
 9936                view_name=temporary_view_name,
 9937                annotation_id=transcript_annotation,
 9938            )
 9939
 9940            # Annotation fields
 9941            if temporary_view_name:
 9942                query_annotation_fields = f"""
 9943                    SELECT *
 9944                    FROM (
 9945                        DESCRIBE SELECT *
 9946                        FROM {temporary_view_name}
 9947                        )
 9948                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9949                """
 9950                df_annotation_fields = self.get_query_to_df(
 9951                    query=query_annotation_fields
 9952                )
 9953
 9954                # Add temporary view and annotation fields
 9955                temporary_tables.append(temporary_view_name)
 9956                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9957
 9958        return temporary_tables, annotation_fields
 9959
 9960    def create_transcript_view(
 9961        self,
 9962        transcripts_table: str = None,
 9963        transcripts_table_drop: bool = True,
 9964        param: dict = {},
 9965    ) -> str:
 9966        """
 9967        The `create_transcript_view` function generates a transcript view by processing data from a
 9968        specified table based on provided parameters and structural information.
 9969
 9970        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9971        is used to specify the name of the table that will store the final transcript view data. If a table
 9972        name is not provided, the function will create a new table to store the transcript view data, and by
 9973        default,, defaults to transcripts
 9974        :type transcripts_table: str (optional)
 9975        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9976        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9977        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9978        the function will drop the existing transcripts table if it exists, defaults to True
 9979        :type transcripts_table_drop: bool (optional)
 9980        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9981        contains information needed to create a transcript view. It includes details such as the structure
 9982        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9983        the view. This parameter allows for flexibility and customization
 9984        :type param: dict
 9985        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9986        created or modified during the execution of the function.
 9987        """
 9988
 9989        log.debug("Start transcripts view creation...")
 9990
 9991        # Default
 9992        transcripts_table_default = "transcripts"
 9993
 9994        # Param
 9995        if not param:
 9996            param = self.get_param()
 9997
 9998        # Struct
 9999        struct = param.get("transcripts", {}).get("struct", None)
10000
10001        if struct:
10002
10003            # Transcripts table
10004            if transcripts_table is None:
10005                transcripts_table = param.get("transcripts", {}).get(
10006                    "table", transcripts_table_default
10007                )
10008
10009            # added_columns
10010            added_columns = []
10011
10012            # Temporary tables
10013            temporary_tables = []
10014
10015            # Annotation fields
10016            annotation_fields = []
10017
10018            # from columns map
10019            columns_maps = struct.get("from_columns_map", [])
10020            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10021                self.create_transcript_view_from_columns_map(
10022                    transcripts_table=transcripts_table,
10023                    columns_maps=columns_maps,
10024                    added_columns=added_columns,
10025                    temporary_tables=temporary_tables,
10026                    annotation_fields=annotation_fields,
10027                )
10028            )
10029            added_columns += added_columns_tmp
10030            temporary_tables += temporary_tables_tmp
10031            annotation_fields += annotation_fields_tmp
10032
10033            # from column format
10034            column_formats = struct.get("from_column_format", [])
10035            temporary_tables_tmp, annotation_fields_tmp = (
10036                self.create_transcript_view_from_column_format(
10037                    transcripts_table=transcripts_table,
10038                    column_formats=column_formats,
10039                    temporary_tables=temporary_tables,
10040                    annotation_fields=annotation_fields,
10041                )
10042            )
10043            temporary_tables += temporary_tables_tmp
10044            annotation_fields += annotation_fields_tmp
10045
10046            # Merge temporary tables query
10047            query_merge = ""
10048            for temporary_table in temporary_tables:
10049
10050                # First temporary table
10051                if not query_merge:
10052                    query_merge = f"""
10053                        SELECT * FROM {temporary_table}
10054                    """
10055                # other temporary table (using UNION)
10056                else:
10057                    query_merge += f"""
10058                        UNION BY NAME SELECT * FROM {temporary_table}
10059                    """
10060
10061            # Merge on transcript
10062            query_merge_on_transcripts_annotation_fields = []
10063            # Aggregate all annotations fields
10064            for annotation_field in set(annotation_fields):
10065                query_merge_on_transcripts_annotation_fields.append(
10066                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
10067                )
10068            # Query for transcripts view
10069            query_merge_on_transcripts = f"""
10070                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
10071                FROM ({query_merge})
10072                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
10073            """
10074
10075            # Drop transcript view is necessary
10076            if transcripts_table_drop:
10077                query_drop = f"""
10078                    DROP TABLE IF EXISTS {transcripts_table};
10079                """
10080                self.execute_query(query=query_drop)
10081
10082            # Merge and create transcript view
10083            query_create_view = f"""
10084                CREATE TABLE IF NOT EXISTS {transcripts_table}
10085                AS {query_merge_on_transcripts}
10086            """
10087            self.execute_query(query=query_create_view)
10088
10089            # Remove added columns
10090            for added_column in added_columns:
10091                self.drop_column(column=added_column)
10092
10093        else:
10094
10095            transcripts_table = None
10096
10097        return transcripts_table
10098
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` converts a structured annotation INFO field (e.g.
        snpEff 'ANN') into a temporary table with one column per annotation sub-field and one row
        per annotation entry.

        The sub-field names are taken from the annotation field description in the VCF header (the
        part quoted between single quotes, sub-fields separated by ' | ').

        :param uniquify: Whether `explode_annotation_format` should uniquify annotation values,
        defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The INFO field that contains the structured annotation, defaults to
        ANN
        :type annotation_field: str (optional)
        :param annotation_id: The annotation sub-field used as transcript identifier (exposed as the
        'transcript' column in the created table), defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The name of the temporary table to create, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the created table, or None if the annotation field is not described in
        the VCF header
        :raises ValueError: If the annotation field description in the header cannot be parsed
        """

        # Name of the intermediate JSON column holding the exploded annotation
        annotation_format = "annotation_explode"

        # Keep only alphanumeric characters of the transcript identifier sub-field name
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix of exploded INFO columns
        # NOTE(review): any configured prefix is forced to "INFO/"; if
        # get_explode_infos_prefix() can return None (rather than ''), the
        # concatenations below would raise a TypeError — confirm it returns a str
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation field and its JSON counterpart
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (source of the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table during the process (removed at the end)
        added_columns = []

        # Explode the annotation field into a column of the variants table
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract sub-field names from the header description
            # (quoted section, sub-fields separated by ' | ')
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Cleaned sub-field name (alphanumeric only), mapped to its original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load the variants (with the exploded annotation column) into a DataFrame;
            # the SQL queries below reference this DataFrame by its Python variable name
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value into a JSON structure (one object per entry)
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the first annotation entry
            # NOTE(review): the queries below reference the column by the unprefixed
            # name 'annotation_explode' while the DataFrame column is named with the
            # prefix applied; these only match when the prefix is empty — confirm
            # against get_explode_infos_prefix()
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed extraction clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key (original JSON key)
                key = row.iloc[0]

                # Cleaned key (alphanumeric only), used as SQL column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key in order to detect the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining (non-empty) values
                column_type = detect_column_type(df_json_type[key_clean])

                # Extraction clause: empty strings become NULL, then cast to the detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing the annotation id sub-field as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not described in the header: no table created
            view_name = None

        # Remove columns added during the process
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
10260
10261    def transcript_view_to_variants(
10262        self,
10263        transcripts_table: str = None,
10264        transcripts_column_id: str = None,
10265        transcripts_info_json: str = None,
10266        transcripts_info_field_json: str = None,
10267        transcripts_info_format: str = None,
10268        transcripts_info_field_format: str = None,
10269        param: dict = {},
10270    ) -> bool:
10271        """
10272        The `transcript_view_to_variants` function updates a variants table with information from
10273        transcripts in JSON format.
10274
10275        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10276        table containing the transcripts data. If this parameter is not provided, the function will
10277        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10278        :type transcripts_table: str
10279        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10280        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10281        identifier is used to match transcripts with variants in the database
10282        :type transcripts_column_id: str
10283        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10284        of the column in the variants table where the transcripts information will be stored in JSON
10285        format. This parameter allows you to define the column in the variants table that will hold the
10286        JSON-formatted information about transcripts
10287        :type transcripts_info_json: str
10288        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10289        specify the field in the VCF header that will contain information about transcripts in JSON
10290        format. This field will be added to the VCF header as an INFO field with the specified name
10291        :type transcripts_info_field_json: str
10292        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10293        format of the information about transcripts that will be stored in the variants table. This
10294        format can be used to define how the transcript information will be structured or displayed
10295        within the variants table
10296        :type transcripts_info_format: str
10297        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10298        specify the field in the VCF header that will contain information about transcripts in a
10299        specific format. This field will be added to the VCF header as an INFO field with the specified
10300        name
10301        :type transcripts_info_field_format: str
10302        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10303        that contains various configuration settings related to transcripts. It is used to provide
10304        default values for certain parameters if they are not explicitly provided when calling the
10305        method. The `param` dictionary can be passed as an argument
10306        :type param: dict
10307        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10308        if the operation is successful and `False` if certain conditions are not met.
10309        """
10310
10311        msg_info_prefix = "Start transcripts view to variants annotations"
10312
10313        log.debug(f"{msg_info_prefix}...")
10314
10315        # Default
10316        transcripts_table_default = "transcripts"
10317        transcripts_column_id_default = "transcript"
10318        transcripts_info_json_default = None
10319        transcripts_info_format_default = None
10320        transcripts_info_field_json_default = None
10321        transcripts_info_field_format_default = None
10322
10323        # Param
10324        if not param:
10325            param = self.get_param()
10326
10327        # Transcripts table
10328        if transcripts_table is None:
10329            transcripts_table = param.get("transcripts", {}).get(
10330                "table", transcripts_table_default
10331            )
10332
10333        # Transcripts column ID
10334        if transcripts_column_id is None:
10335            transcripts_column_id = param.get("transcripts", {}).get(
10336                "column_id", transcripts_column_id_default
10337            )
10338
10339        # Transcripts info json
10340        if transcripts_info_json is None:
10341            transcripts_info_json = param.get("transcripts", {}).get(
10342                "transcripts_info_json", transcripts_info_json_default
10343            )
10344
10345        # Transcripts info field JSON
10346        if transcripts_info_field_json is None:
10347            transcripts_info_field_json = param.get("transcripts", {}).get(
10348                "transcripts_info_field_json", transcripts_info_field_json_default
10349            )
10350        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10351        #     transcripts_info_json = transcripts_info_field_json
10352
10353        # Transcripts info format
10354        if transcripts_info_format is None:
10355            transcripts_info_format = param.get("transcripts", {}).get(
10356                "transcripts_info_format", transcripts_info_format_default
10357            )
10358
10359        # Transcripts info field FORMAT
10360        if transcripts_info_field_format is None:
10361            transcripts_info_field_format = param.get("transcripts", {}).get(
10362                "transcripts_info_field_format", transcripts_info_field_format_default
10363            )
10364        # if (
10365        #     transcripts_info_field_format is not None
10366        #     and transcripts_info_format is None
10367        # ):
10368        #     transcripts_info_format = transcripts_info_field_format
10369
10370        # Variants table
10371        table_variants = self.get_table_variants()
10372
10373        # Check info columns param
10374        if (
10375            transcripts_info_json is None
10376            and transcripts_info_field_json is None
10377            and transcripts_info_format is None
10378            and transcripts_info_field_format is None
10379        ):
10380            return False
10381
10382        # Transcripts infos columns
10383        query_transcripts_infos_columns = f"""
10384            SELECT *
10385            FROM (
10386                DESCRIBE SELECT * FROM {transcripts_table}
10387                )
10388            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10389        """
10390        transcripts_infos_columns = list(
10391            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10392        )
10393
10394        # View results
10395        clause_select = []
10396        clause_to_json = []
10397        clause_to_format = []
10398        for field in transcripts_infos_columns:
10399            clause_select.append(
10400                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10401            )
10402            clause_to_json.append(f""" '{field}': "{field}" """)
10403            clause_to_format.append(f""" "{field}" """)
10404
10405        # Update
10406        update_set_json = []
10407        update_set_format = []
10408
10409        # VCF header
10410        vcf_reader = self.get_header()
10411
10412        # Transcripts to info column in JSON
10413        if transcripts_info_json is not None:
10414
10415            # Create column on variants table
10416            self.add_column(
10417                table_name=table_variants,
10418                column_name=transcripts_info_json,
10419                column_type="JSON",
10420                default_value=None,
10421                drop=False,
10422            )
10423
10424            # Add header
10425            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10426                transcripts_info_json,
10427                ".",
10428                "String",
10429                "Transcripts in JSON format",
10430                "unknwon",
10431                "unknwon",
10432                self.code_type_map["String"],
10433            )
10434
10435            # Add to update
10436            update_set_json.append(
10437                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10438            )
10439
10440        # Transcripts to info field in JSON
10441        if transcripts_info_field_json is not None:
10442
10443            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10444
10445            # Add to update
10446            update_set_json.append(
10447                f""" 
10448                    INFO = concat(
10449                            CASE
10450                                WHEN INFO NOT IN ('', '.')
10451                                THEN INFO
10452                                ELSE ''
10453                            END,
10454                            CASE
10455                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10456                                THEN concat(
10457                                    ';{transcripts_info_field_json}=',
10458                                    t.{transcripts_info_json}
10459                                )
10460                                ELSE ''
10461                            END
10462                            )
10463                """
10464            )
10465
10466            # Add header
10467            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10468                transcripts_info_field_json,
10469                ".",
10470                "String",
10471                "Transcripts in JSON format",
10472                "unknwon",
10473                "unknwon",
10474                self.code_type_map["String"],
10475            )
10476
10477        if update_set_json:
10478
10479            # Update query
10480            query_update = f"""
10481                UPDATE {table_variants}
10482                    SET {", ".join(update_set_json)}
10483                FROM
10484                (
10485                    SELECT
10486                        "#CHROM", POS, REF, ALT,
10487                            concat(
10488                            '{{',
10489                            string_agg(
10490                                '"' || "{transcripts_column_id}" || '":' ||
10491                                to_json(json_output)
10492                            ),
10493                            '}}'
10494                            )::JSON AS {transcripts_info_json}
10495                    FROM
10496                        (
10497                        SELECT
10498                            "#CHROM", POS, REF, ALT,
10499                            "{transcripts_column_id}",
10500                            to_json(
10501                                {{{",".join(clause_to_json)}}}
10502                            )::JSON AS json_output
10503                        FROM
10504                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10505                        WHERE "{transcripts_column_id}" IS NOT NULL
10506                        )
10507                    GROUP BY "#CHROM", POS, REF, ALT
10508                ) AS t
10509                WHERE {table_variants}."#CHROM" = t."#CHROM"
10510                    AND {table_variants}."POS" = t."POS"
10511                    AND {table_variants}."REF" = t."REF"
10512                    AND {table_variants}."ALT" = t."ALT"
10513            """
10514
10515            self.execute_query(query=query_update)
10516
10517        # Transcripts to info column in FORMAT
10518        if transcripts_info_format is not None:
10519
10520            # Create column on variants table
10521            self.add_column(
10522                table_name=table_variants,
10523                column_name=transcripts_info_format,
10524                column_type="VARCHAR",
10525                default_value=None,
10526                drop=False,
10527            )
10528
10529            # Add header
10530            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10531                transcripts_info_format,
10532                ".",
10533                "String",
10534                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10535                "unknwon",
10536                "unknwon",
10537                self.code_type_map["String"],
10538            )
10539
10540            # Add to update
10541            update_set_format.append(
10542                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10543            )
10544
10545        # Transcripts to info field in JSON
10546        if transcripts_info_field_format is not None:
10547
10548            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10549
10550            # Add to update
10551            update_set_format.append(
10552                f""" 
10553                    INFO = concat(
10554                            CASE
10555                                WHEN INFO NOT IN ('', '.')
10556                                THEN INFO
10557                                ELSE ''
10558                            END,
10559                            CASE
10560                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10561                                THEN concat(
10562                                    ';{transcripts_info_field_format}=',
10563                                    t.{transcripts_info_format}
10564                                )
10565                                ELSE ''
10566                            END
10567                            )
10568                """
10569            )
10570
10571            # Add header
10572            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10573                transcripts_info_field_format,
10574                ".",
10575                "String",
10576                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10577                "unknwon",
10578                "unknwon",
10579                self.code_type_map["String"],
10580            )
10581
10582        if update_set_format:
10583
10584            # Update query
10585            query_update = f"""
10586                UPDATE {table_variants}
10587                    SET {", ".join(update_set_format)}
10588                FROM
10589                (
10590                    SELECT
10591                        "#CHROM", POS, REF, ALT,
10592                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10593                    FROM 
10594                        (
10595                        SELECT
10596                            "#CHROM", POS, REF, ALT,
10597                            "{transcripts_column_id}",
10598                            concat(
10599                                "{transcripts_column_id}",
10600                                '|',
10601                                {", '|', ".join(clause_to_format)}
10602                            ) AS {transcripts_info_format}
10603                        FROM
10604                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10605                        )
10606                    GROUP BY "#CHROM", POS, REF, ALT
10607                ) AS t
10608                WHERE {table_variants}."#CHROM" = t."#CHROM"
10609                    AND {table_variants}."POS" = t."POS"
10610                    AND {table_variants}."REF" = t."REF"
10611                    AND {table_variants}."ALT" = t."ALT"
10612            """
10613
10614            self.execute_query(query=query_update)
10615
10616        return True
class Variants:
   34class Variants:
   35
   36    def __init__(
   37        self,
   38        conn=None,
   39        input: str = None,
   40        output: str = None,
   41        config: dict = {},
   42        param: dict = {},
   43        load: bool = False,
   44    ) -> None:
   45        """
   46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   47        header
   48
   49        :param conn: the connection to the database
   50        :param input: the input file
   51        :param output: the output file
   52        :param config: a dictionary containing the configuration of the model
   53        :param param: a dictionary containing the parameters of the model
   54        """
   55
   56        # Init variables
   57        self.init_variables()
   58
   59        # Input
   60        self.set_input(input)
   61
   62        # Config
   63        self.set_config(config)
   64
   65        # Param
   66        self.set_param(param)
   67
   68        # Output
   69        self.set_output(output)
   70
   71        # connexion
   72        self.set_connexion(conn)
   73
   74        # Header
   75        self.set_header()
   76
   77        # Samples
   78        self.set_samples()
   79
   80        # Load data
   81        if load:
   82            self.load_data()
   83
   84    def set_samples(self, samples: list = None) -> list:
   85        """
   86        The function `set_samples` sets the samples attribute of an object to a provided list or
   87        retrieves it from a parameter dictionary.
   88
   89        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   90        input and sets the `samples` attribute of the class to the provided list. If no samples are
   91        provided, it tries to get the samples from the class's parameters using the `get_param` method
   92        :type samples: list
   93        :return: The `samples` list is being returned.
   94        """
   95
   96        if not samples:
   97            samples = self.get_param().get("samples", {}).get("list", None)
   98
   99        self.samples = samples
  100
  101        return samples
  102
  103    def get_samples(self) -> list:
  104        """
  105        This function returns a list of samples.
  106        :return: The `get_samples` method is returning the `samples` attribute of the object.
  107        """
  108
  109        return self.samples
  110
  111    def get_samples_check(self) -> bool:
  112        """
  113        This function returns the value of the "check" key within the "samples" dictionary retrieved
  114        from the parameters.
  115        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  116        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  117        method. If the key "check" is not found, it will return `False`.
  118        """
  119
  120        return self.get_param().get("samples", {}).get("check", True)
  121
  122    def set_input(self, input: str = None) -> None:
  123        """
  124        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  125        attributes in the class accordingly.
  126
  127        :param input: The `set_input` method in the provided code snippet is used to set attributes
  128        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  129        :type input: str
  130        """
  131
  132        if input and not isinstance(input, str):
  133            try:
  134                self.input = input.name
  135            except:
  136                log.error(f"Input file '{input} in bad format")
  137                raise ValueError(f"Input file '{input} in bad format")
  138        else:
  139            self.input = input
  140
  141        # Input format
  142        if input:
  143            input_name, input_extension = os.path.splitext(self.input)
  144            self.input_name = input_name
  145            self.input_extension = input_extension
  146            self.input_format = self.input_extension.replace(".", "")
  147
  148    def set_config(self, config: dict) -> None:
  149        """
  150        The set_config function takes a config object and assigns it as the configuration object for the
  151        class.
  152
  153        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  154        contains configuration settings for the class. When you call the `set_config` function with a
  155        dictionary object as the argument, it will set that dictionary as the configuration object for
  156        the class
  157        :type config: dict
  158        """
  159
  160        self.config = config
  161
  162    def set_param(self, param: dict) -> None:
  163        """
  164        This function sets a parameter object for the class based on the input dictionary.
  165
  166        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  167        as the `param` attribute of the class instance
  168        :type param: dict
  169        """
  170
  171        self.param = param
  172
  173    def init_variables(self) -> None:
  174        """
  175        This function initializes the variables that will be used in the rest of the class
  176        """
  177
  178        self.prefix = "howard"
  179        self.table_variants = "variants"
  180        self.dataframe = None
  181
  182        self.comparison_map = {
  183            "gt": ">",
  184            "gte": ">=",
  185            "lt": "<",
  186            "lte": "<=",
  187            "equals": "=",
  188            "contains": "SIMILAR TO",
  189        }
  190
  191        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  192
  193        self.code_type_map_to_sql = {
  194            "Integer": "INTEGER",
  195            "String": "VARCHAR",
  196            "Float": "FLOAT",
  197            "Flag": "VARCHAR",
  198        }
  199
  200        self.index_additionnal_fields = []
  201
  202    def get_indexing(self) -> bool:
  203        """
  204        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  205        returns False.
  206        :return: The value of the indexing parameter.
  207        """
  208
  209        return self.get_param().get("indexing", False)
  210
  211    def get_connexion_config(self) -> dict:
  212        """
  213        The function `get_connexion_config` returns a dictionary containing the configuration for a
  214        connection, including the number of threads and memory limit.
  215        :return: a dictionary containing the configuration for the Connexion library.
  216        """
  217
  218        # config
  219        config = self.get_config()
  220
  221        # Connexion config
  222        connexion_config = {}
  223        threads = self.get_threads()
  224
  225        # Threads
  226        if threads:
  227            connexion_config["threads"] = threads
  228
  229        # Memory
  230        # if config.get("memory", None):
  231        #     connexion_config["memory_limit"] = config.get("memory")
  232        if self.get_memory():
  233            connexion_config["memory_limit"] = self.get_memory()
  234
  235        # Temporary directory
  236        if config.get("tmp", None):
  237            connexion_config["temp_directory"] = config.get("tmp")
  238
  239        # Access
  240        if config.get("access", None):
  241            access = config.get("access")
  242            if access in ["RO"]:
  243                access = "READ_ONLY"
  244            elif access in ["RW"]:
  245                access = "READ_WRITE"
  246            connexion_db = self.get_connexion_db()
  247            if connexion_db in ":memory:":
  248                access = "READ_WRITE"
  249            connexion_config["access_mode"] = access
  250
  251        return connexion_config
  252
  253    def get_duckdb_settings(self) -> dict:
  254        """
  255        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  256        string.
  257        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  258        """
  259
  260        # config
  261        config = self.get_config()
  262
  263        # duckdb settings
  264        duckdb_settings_dict = {}
  265        if config.get("duckdb_settings", None):
  266            duckdb_settings = config.get("duckdb_settings")
  267            duckdb_settings = full_path(duckdb_settings)
  268            # duckdb setting is a file
  269            if os.path.exists(duckdb_settings):
  270                with open(duckdb_settings) as json_file:
  271                    duckdb_settings_dict = yaml.safe_load(json_file)
  272            # duckdb settings is a string
  273            else:
  274                duckdb_settings_dict = json.loads(duckdb_settings)
  275
  276        return duckdb_settings_dict
  277
  278    def set_connexion_db(self) -> str:
  279        """
  280        The function `set_connexion_db` returns the appropriate database connection string based on the
  281        input format and connection type.
  282        :return: the value of the variable `connexion_db`.
  283        """
  284
  285        # Default connexion db
  286        default_connexion_db = ":memory:"
  287
  288        # Find connexion db
  289        if self.get_input_format() in ["db", "duckdb"]:
  290            connexion_db = self.get_input()
  291        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  292            connexion_db = default_connexion_db
  293        elif self.get_connexion_type() in ["tmpfile"]:
  294            tmp_name = tempfile.mkdtemp(
  295                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  296            )
  297            connexion_db = f"{tmp_name}/tmp.db"
  298        elif self.get_connexion_type() != "":
  299            connexion_db = self.get_connexion_type()
  300        else:
  301            connexion_db = default_connexion_db
  302
  303        # Set connexion db
  304        self.connexion_db = connexion_db
  305
  306        return connexion_db
  307
  308    def set_connexion(self, conn) -> None:
  309        """
  310        The function `set_connexion` creates a connection to a database, with options for different
  311        database formats and settings.
  312
  313        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  314        database. If a connection is not provided, a new connection to an in-memory database is created.
  315        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  316        sqlite
  317        """
  318
  319        # Connexion db
  320        connexion_db = self.set_connexion_db()
  321
  322        # Connexion config
  323        connexion_config = self.get_connexion_config()
  324
  325        # Connexion format
  326        connexion_format = self.get_config().get("connexion_format", "duckdb")
  327        # Set connexion format
  328        self.connexion_format = connexion_format
  329
  330        # Connexion
  331        if not conn:
  332            if connexion_format in ["duckdb"]:
  333                conn = duckdb.connect(connexion_db, config=connexion_config)
  334                # duckDB settings
  335                duckdb_settings = self.get_duckdb_settings()
  336                if duckdb_settings:
  337                    for setting in duckdb_settings:
  338                        setting_value = duckdb_settings.get(setting)
  339                        if isinstance(setting_value, str):
  340                            setting_value = f"'{setting_value}'"
  341                        conn.execute(f"PRAGMA {setting}={setting_value};")
  342            elif connexion_format in ["sqlite"]:
  343                conn = sqlite3.connect(connexion_db)
  344
  345        # Set connexion
  346        self.conn = conn
  347
  348        # Log
  349        log.debug(f"connexion_format: {connexion_format}")
  350        log.debug(f"connexion_db: {connexion_db}")
  351        log.debug(f"connexion config: {connexion_config}")
  352        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  353
  354    def set_output(self, output: str = None) -> None:
  355        """
  356        The `set_output` function in Python sets the output file based on the input or a specified key
  357        in the config file, extracting the output name, extension, and format.
  358
  359        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  360        the output file. If the config file has an 'output' key, the method sets the output to the value
  361        of that key. If no output is provided, it sets the output to `None`
  362        :type output: str
  363        """
  364
  365        if output and not isinstance(output, str):
  366            self.output = output.name
  367        else:
  368            self.output = output
  369
  370        # Output format
  371        if self.output:
  372            output_name, output_extension = os.path.splitext(self.output)
  373            self.output_name = output_name
  374            self.output_extension = output_extension
  375            self.output_format = self.output_extension.replace(".", "")
  376        else:
  377            self.output_name = None
  378            self.output_extension = None
  379            self.output_format = None
  380
  381    def set_header(self) -> None:
  382        """
  383        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  384        """
  385
  386        input_file = self.get_input()
  387        default_header_list = [
  388            "##fileformat=VCFv4.2",
  389            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  390        ]
  391
  392        # Full path
  393        input_file = full_path(input_file)
  394
  395        if input_file:
  396
  397            input_format = self.get_input_format()
  398            input_compressed = self.get_input_compressed()
  399            config = self.get_config()
  400            header_list = default_header_list
  401            if input_format in [
  402                "vcf",
  403                "hdr",
  404                "tsv",
  405                "csv",
  406                "psv",
  407                "parquet",
  408                "db",
  409                "duckdb",
  410            ]:
  411                # header provided in param
  412                if config.get("header_file", None):
  413                    with open(config.get("header_file"), "rt") as f:
  414                        header_list = self.read_vcf_header(f)
  415                # within a vcf file format (header within input file itsself)
  416                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  417                    # within a compressed vcf file format (.vcf.gz)
  418                    if input_compressed:
  419                        with bgzf.open(input_file, "rt") as f:
  420                            header_list = self.read_vcf_header(f)
  421                    # within an uncompressed vcf file format (.vcf)
  422                    else:
  423                        with open(input_file, "rt") as f:
  424                            header_list = self.read_vcf_header(f)
  425                # header provided in default external file .hdr
  426                elif os.path.exists((input_file + ".hdr")):
  427                    with open(input_file + ".hdr", "rt") as f:
  428                        header_list = self.read_vcf_header(f)
  429                else:
  430                    try:  # Try to get header info fields and file columns
  431
  432                        with tempfile.TemporaryDirectory() as tmpdir:
  433
  434                            # Create database
  435                            db_for_header = Database(database=input_file)
  436
  437                            # Get header columns for infos fields
  438                            db_header_from_columns = (
  439                                db_for_header.get_header_from_columns()
  440                            )
  441
  442                            # Get real columns in the file
  443                            db_header_columns = db_for_header.get_columns()
  444
  445                            # Write header file
  446                            header_file_tmp = os.path.join(tmpdir, "header")
  447                            f = open(header_file_tmp, "w")
  448                            vcf.Writer(f, db_header_from_columns)
  449                            f.close()
  450
  451                            # Replace #CHROM line with rel columns
  452                            header_list = db_for_header.read_header_file(
  453                                header_file=header_file_tmp
  454                            )
  455                            header_list[-1] = "\t".join(db_header_columns)
  456
  457                    except:
  458
  459                        log.warning(
  460                            f"No header for file {input_file}. Set as default VCF header"
  461                        )
  462                        header_list = default_header_list
  463
  464            else:  # try for unknown format ?
  465
  466                log.error(f"Input file format '{input_format}' not available")
  467                raise ValueError(f"Input file format '{input_format}' not available")
  468
  469            if not header_list:
  470                header_list = default_header_list
  471
  472            # header as list
  473            self.header_list = header_list
  474
  475            # header as VCF object
  476            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  477
  478        else:
  479
  480            self.header_list = None
  481            self.header_vcf = None
  482
  483    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  484        """
  485        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  486        DataFrame based on the connection format.
  487
  488        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  489        represents the SQL query you want to execute. This query will be used to fetch data from a
  490        database and convert it into a pandas DataFrame
  491        :type query: str
  492        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  493        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  494        function will only fetch up to that number of rows from the database query result. If no limit
  495        is specified,
  496        :type limit: int
  497        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  498        """
  499
  500        # Connexion format
  501        connexion_format = self.get_connexion_format()
  502
  503        # Limit in query
  504        if limit:
  505            pd.set_option("display.max_rows", limit)
  506            if connexion_format in ["duckdb"]:
  507                df = (
  508                    self.conn.execute(query)
  509                    .fetch_record_batch(limit)
  510                    .read_next_batch()
  511                    .to_pandas()
  512                )
  513            elif connexion_format in ["sqlite"]:
  514                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  515
  516        # Full query
  517        else:
  518            if connexion_format in ["duckdb"]:
  519                df = self.conn.execute(query).df()
  520            elif connexion_format in ["sqlite"]:
  521                df = pd.read_sql_query(query, self.conn)
  522
  523        return df
  524
  525    def get_overview(self) -> None:
  526        """
  527        The function prints the input, output, config, and dataframe of the current object
  528        """
  529        table_variants_from = self.get_table_variants(clause="from")
  530        sql_columns = self.get_header_columns_as_sql()
  531        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  532        df = self.get_query_to_df(sql_query_export)
  533        log.info(
  534            "Input:  "
  535            + str(self.get_input())
  536            + " ["
  537            + str(str(self.get_input_format()))
  538            + "]"
  539        )
  540        log.info(
  541            "Output: "
  542            + str(self.get_output())
  543            + " ["
  544            + str(str(self.get_output_format()))
  545            + "]"
  546        )
  547        log.info("Config: ")
  548        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  549            "\n"
  550        ):
  551            log.info("\t" + str(d))
  552        log.info("Param: ")
  553        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  554            "\n"
  555        ):
  556            log.info("\t" + str(d))
  557        log.info("Sample list: " + str(self.get_header_sample_list()))
  558        log.info("Dataframe: ")
  559        for d in str(df).split("\n"):
  560            log.info("\t" + str(d))
  561
  562        # garbage collector
  563        del df
  564        gc.collect()
  565
  566        return None
  567
  568    def get_stats(self) -> dict:
  569        """
  570        The `get_stats` function calculates and returns various statistics of the current object,
  571        including information about the input file, variants, samples, header fields, quality, and
  572        SNVs/InDels.
  573        :return: a dictionary containing various statistics of the current object. The dictionary has
  574        the following structure:
  575        """
  576
  577        # Log
  578        log.info(f"Stats Calculation...")
  579
  580        # table varaints
  581        table_variants_from = self.get_table_variants()
  582
  583        # stats dict
  584        stats = {"Infos": {}}
  585
  586        ### File
  587        input_file = self.get_input()
  588        stats["Infos"]["Input file"] = input_file
  589
  590        # Header
  591        header_infos = self.get_header().infos
  592        header_formats = self.get_header().formats
  593        header_infos_list = list(header_infos)
  594        header_formats_list = list(header_formats)
  595
  596        ### Variants
  597
  598        stats["Variants"] = {}
  599
  600        # Variants by chr
  601        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  602        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  603        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  604            by=["CHROM"], kind="quicksort"
  605        )
  606
  607        # Total number of variants
  608        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  609
  610        # Calculate percentage
  611        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  612            lambda x: (x / nb_of_variants)
  613        )
  614
  615        stats["Variants"]["Number of variants by chromosome"] = (
  616            nb_of_variants_by_chrom.to_dict(orient="index")
  617        )
  618
  619        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  620
  621        ### Samples
  622
  623        # Init
  624        samples = {}
  625        nb_of_samples = 0
  626
  627        # Check Samples
  628        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  629            log.debug(f"Check samples...")
  630            for sample in self.get_header_sample_list():
  631                sql_query_samples = f"""
  632                    SELECT  '{sample}' as sample,
  633                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  634                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  635                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  636                    FROM {table_variants_from}
  637                    WHERE (
  638                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  639                        AND
  640                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  641                      )
  642                    GROUP BY genotype
  643                    """
  644                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  645                sample_genotype_count = sql_query_genotype_df["count"].sum()
  646                if len(sql_query_genotype_df):
  647                    nb_of_samples += 1
  648                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  649                        sql_query_genotype_df.to_dict(orient="index")
  650                    )
  651
  652            stats["Samples"] = samples
  653            stats["Infos"]["Number of samples"] = nb_of_samples
  654
  655        # #
  656        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  657        #     stats["Infos"]["Number of samples"] = nb_of_samples
  658        # elif nb_of_samples:
  659        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  660
  661        ### INFO and FORMAT fields
  662        header_types_df = {}
  663        header_types_list = {
  664            "List of INFO fields": header_infos,
  665            "List of FORMAT fields": header_formats,
  666        }
  667        i = 0
  668        for header_type in header_types_list:
  669
  670            header_type_infos = header_types_list.get(header_type)
  671            header_infos_dict = {}
  672
  673            for info in header_type_infos:
  674
  675                i += 1
  676                header_infos_dict[i] = {}
  677
  678                # ID
  679                header_infos_dict[i]["id"] = info
  680
  681                # num
  682                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  683                if header_type_infos[info].num in genotype_map.keys():
  684                    header_infos_dict[i]["Number"] = genotype_map.get(
  685                        header_type_infos[info].num
  686                    )
  687                else:
  688                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  689
  690                # type
  691                if header_type_infos[info].type:
  692                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  693                else:
  694                    header_infos_dict[i]["Type"] = "."
  695
  696                # desc
  697                if header_type_infos[info].desc != None:
  698                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  699                else:
  700                    header_infos_dict[i]["Description"] = ""
  701
  702            if len(header_infos_dict):
  703                header_types_df[header_type] = pd.DataFrame.from_dict(
  704                    header_infos_dict, orient="index"
  705                ).to_dict(orient="index")
  706
  707        # Stats
  708        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  709        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  710        stats["Header"] = header_types_df
  711
  712        ### QUAL
  713        if "QUAL" in self.get_header_columns():
  714            sql_query_qual = f"""
  715                    SELECT
  716                        avg(CAST(QUAL AS INTEGER)) AS Average,
  717                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  718                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  719                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  720                        median(CAST(QUAL AS INTEGER)) AS Median,
  721                        variance(CAST(QUAL AS INTEGER)) AS Variance
  722                    FROM {table_variants_from}
  723                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  724                    """
  725
  726            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  727            stats["Quality"] = {"Stats": qual}
  728
  729        ### SNV and InDel
  730
  731        sql_query_snv = f"""
  732            
  733            SELECT Type, count FROM (
  734
  735                    SELECT
  736                        'Total' AS Type,
  737                        count(*) AS count
  738                    FROM {table_variants_from}
  739
  740                    UNION
  741
  742                    SELECT
  743                        'MNV' AS Type,
  744                        count(*) AS count
  745                    FROM {table_variants_from}
  746                    WHERE len(REF) > 1 AND len(ALT) > 1
  747                    AND len(REF) = len(ALT)
  748
  749                    UNION
  750
  751                    SELECT
  752                        'InDel' AS Type,
  753                        count(*) AS count
  754                    FROM {table_variants_from}
  755                    WHERE len(REF) > 1 OR len(ALT) > 1
  756                    AND len(REF) != len(ALT)
  757                    
  758                    UNION
  759
  760                    SELECT
  761                        'SNV' AS Type,
  762                        count(*) AS count
  763                    FROM {table_variants_from}
  764                    WHERE len(REF) = 1 AND len(ALT) = 1
  765
  766                )
  767
  768            ORDER BY count DESC
  769
  770                """
  771        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  772
  773        sql_query_snv_substitution = f"""
  774                SELECT
  775                    concat(REF, '>', ALT) AS 'Substitution',
  776                    count(*) AS count
  777                FROM {table_variants_from}
  778                WHERE len(REF) = 1 AND len(ALT) = 1
  779                GROUP BY REF, ALT
  780                ORDER BY count(*) DESC
  781                """
  782        snv_substitution = (
  783            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  784        )
  785        stats["Variants"]["Counts"] = snv_indel
  786        stats["Variants"]["Substitutions"] = snv_substitution
  787
  788        return stats
  789
  790    def stats_to_file(self, file: str = None) -> str:
  791        """
  792        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  793        into a JSON object, and writes the JSON object to the specified file.
  794
  795        :param file: The `file` parameter is a string that represents the file path where the JSON data
  796        will be written
  797        :type file: str
  798        :return: the name of the file that was written to.
  799        """
  800
  801        # Get stats
  802        stats = self.get_stats()
  803
  804        # Serializing json
  805        json_object = json.dumps(stats, indent=4)
  806
  807        # Writing to sample.json
  808        with open(file, "w") as outfile:
  809            outfile.write(json_object)
  810
  811        return file
  812
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the stats and print it to stdout.

        Stats are first written as JSON (via `stats_to_file`), then reloaded
        and rendered as markdown sections ("##" per top-level key) with an
        index of internal links.

        :param output_file: Path of the markdown output file; when not
        provided, a temporary "stats.md" is used (deleted when the temporary
        directory is cleaned up)
        :type output_file: str
        :param json_file: Path of the JSON stats file; when not provided, a
        temporary "stats.json" is used
        :type json_file: str
        :return: None
        """

        # Resolve both paths to absolute paths
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default files live in the temporary folder
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create parent folders of both outputs if needed
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Compute and write the stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Reload the stats (the YAML parser also reads JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index of links, and body sections
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # One "##" section per top-level stats key
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Dict-like entries render as markdown tables; anything
                        # that fails both conversions renders as a bullet line
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                # The value may be a JSON-encoded string
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            # NOTE(review): DataFrame.to_markdown requires the
                            # optional "tabulate" package — confirm dependency
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write title, then index, then body into the markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print title and body to stdout (the index is file-only)
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
  914
  915    def get_input(self) -> str:
  916        """
  917        It returns the value of the input variable.
  918        :return: The input is being returned.
  919        """
  920        return self.input
  921
  922    def get_input_format(self, input_file: str = None) -> str:
  923        """
  924        This function returns the format of the input variable, either from the provided input file or
  925        by prompting for input.
  926
  927        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  928        represents the file path of the input file. If no `input_file` is provided when calling the
  929        method, it will default to `None`
  930        :type input_file: str
  931        :return: The format of the input variable is being returned.
  932        """
  933
  934        if not input_file:
  935            input_file = self.get_input()
  936        input_format = get_file_format(input_file)
  937        return input_format
  938
  939    def get_input_compressed(self, input_file: str = None) -> str:
  940        """
  941        The function `get_input_compressed` returns the format of the input variable after compressing
  942        it.
  943
  944        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  945        that represents the file path of the input file. If no `input_file` is provided when calling the
  946        method, it will default to `None` and the method will then call `self.get_input()` to
  947        :type input_file: str
  948        :return: The function `get_input_compressed` returns the compressed format of the input
  949        variable.
  950        """
  951
  952        if not input_file:
  953            input_file = self.get_input()
  954        input_compressed = get_file_compressed(input_file)
  955        return input_compressed
  956
  957    def get_output(self) -> str:
  958        """
  959        It returns the output of the neuron.
  960        :return: The output of the neural network.
  961        """
  962
  963        return self.output
  964
  965    def get_output_format(self, output_file: str = None) -> str:
  966        """
  967        The function `get_output_format` returns the format of the input variable or the output file if
  968        provided.
  969
  970        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  971        that represents the file path of the output file. If no `output_file` is provided when calling
  972        the method, it will default to the output obtained from the `get_output` method of the class
  973        instance. The
  974        :type output_file: str
  975        :return: The format of the input variable is being returned.
  976        """
  977
  978        if not output_file:
  979            output_file = self.get_output()
  980        output_format = get_file_format(output_file)
  981
  982        return output_format
  983
  984    def get_config(self) -> dict:
  985        """
  986        It returns the config
  987        :return: The config variable is being returned.
  988        """
  989        return self.config
  990
  991    def get_param(self) -> dict:
  992        """
  993        It returns the param
  994        :return: The param variable is being returned.
  995        """
  996        return self.param
  997
  998    def get_connexion_db(self) -> str:
  999        """
 1000        It returns the connexion_db attribute of the object
 1001        :return: The connexion_db is being returned.
 1002        """
 1003        return self.connexion_db
 1004
 1005    def get_prefix(self) -> str:
 1006        """
 1007        It returns the prefix of the object.
 1008        :return: The prefix is being returned.
 1009        """
 1010        return self.prefix
 1011
 1012    def get_table_variants(self, clause: str = "select") -> str:
 1013        """
 1014        This function returns the table_variants attribute of the object
 1015
 1016        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1017        defaults to select (optional)
 1018        :return: The table_variants attribute of the object.
 1019        """
 1020
 1021        # Access
 1022        access = self.get_config().get("access", None)
 1023
 1024        # Clauses "select", "where", "update"
 1025        if clause in ["select", "where", "update"]:
 1026            table_variants = self.table_variants
 1027        # Clause "from"
 1028        elif clause in ["from"]:
 1029            # For Read Only
 1030            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1031                input_file = self.get_input()
 1032                table_variants = f"'{input_file}' as variants"
 1033            # For Read Write
 1034            else:
 1035                table_variants = f"{self.table_variants} as variants"
 1036        else:
 1037            table_variants = self.table_variants
 1038        return table_variants
 1039
 1040    def get_tmp_dir(self) -> str:
 1041        """
 1042        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1043        parameters or a default path.
 1044        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1045        configuration, parameters, and a default value of "/tmp".
 1046        """
 1047
 1048        return get_tmp(
 1049            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1050        )
 1051
 1052    def get_connexion_type(self) -> str:
 1053        """
 1054        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1055
 1056        :return: The connexion type is being returned.
 1057        """
 1058        return self.get_config().get("connexion_type", "memory")
 1059
 1060    def get_connexion(self):
 1061        """
 1062        It returns the connection object
 1063
 1064        :return: The connection object.
 1065        """
 1066        return self.conn
 1067
 1068    def close_connexion(self) -> None:
 1069        """
 1070        This function closes the connection to the database.
 1071        :return: The connection is being closed.
 1072        """
 1073        return self.conn.close()
 1074
 1075    def get_header(self, type: str = "vcf"):
 1076        """
 1077        This function returns the header of the VCF file as a list of strings
 1078
 1079        :param type: the type of header you want to get, defaults to vcf (optional)
 1080        :return: The header of the vcf file.
 1081        """
 1082
 1083        if self.header_vcf:
 1084            if type == "vcf":
 1085                return self.header_vcf
 1086            elif type == "list":
 1087                return self.header_list
 1088        else:
 1089            if type == "vcf":
 1090                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1091                return header
 1092            elif type == "list":
 1093                return vcf_required
 1094
 1095    def get_header_length(self, file: str = None) -> int:
 1096        """
 1097        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1098        line.
 1099
 1100        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1101        header file. If this argument is provided, the function will read the header from the specified
 1102        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1103        :type file: str
 1104        :return: the length of the header list, excluding the #CHROM line.
 1105        """
 1106
 1107        if file:
 1108            return len(self.read_vcf_header_file(file=file)) - 1
 1109        elif self.get_header(type="list"):
 1110            return len(self.get_header(type="list")) - 1
 1111        else:
 1112            return 0
 1113
 1114    def get_header_columns(self) -> str:
 1115        """
 1116        This function returns the header list of a VCF
 1117
 1118        :return: The length of the header list.
 1119        """
 1120        if self.get_header():
 1121            return self.get_header(type="list")[-1]
 1122        else:
 1123            return ""
 1124
 1125    def get_header_columns_as_list(self) -> list:
 1126        """
 1127        This function returns the header list of a VCF
 1128
 1129        :return: The length of the header list.
 1130        """
 1131        if self.get_header():
 1132            return self.get_header_columns().strip().split("\t")
 1133        else:
 1134            return []
 1135
 1136    def get_header_columns_as_sql(self) -> str:
 1137        """
 1138        This function retruns header length (without #CHROM line)
 1139
 1140        :return: The length of the header list.
 1141        """
 1142        sql_column_list = []
 1143        for col in self.get_header_columns_as_list():
 1144            sql_column_list.append(f'"{col}"')
 1145        return ",".join(sql_column_list)
 1146
 1147    def get_header_sample_list(
 1148        self, check: bool = False, samples: list = None, samples_force: bool = False
 1149    ) -> list:
 1150        """
 1151        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1152        checking and filtering based on input parameters.
 1153
 1154        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1155        parameter that determines whether to check if the samples in the list are properly defined as
 1156        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1157        list is defined as a, defaults to False
 1158        :type check: bool (optional)
 1159        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1160        allows you to specify a subset of samples from the header. If you provide a list of sample
 1161        names, the function will check if each sample is defined in the header. If a sample is not found
 1162        in the
 1163        :type samples: list
 1164        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1165        a boolean parameter that determines whether to force the function to return the sample list
 1166        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1167        function will return the sample list without performing, defaults to False
 1168        :type samples_force: bool (optional)
 1169        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1170        parameters and conditions specified in the function.
 1171        """
 1172
 1173        # Init
 1174        samples_list = []
 1175
 1176        if samples is None:
 1177            samples_list = self.header_vcf.samples
 1178        else:
 1179            samples_checked = []
 1180            for sample in samples:
 1181                if sample in self.header_vcf.samples:
 1182                    samples_checked.append(sample)
 1183                else:
 1184                    log.warning(f"Sample '{sample}' not defined in header")
 1185            samples_list = samples_checked
 1186
 1187            # Force sample list without checking if is_genotype_column
 1188            if samples_force:
 1189                log.warning(f"Samples {samples_list} not checked if genotypes")
 1190                return samples_list
 1191
 1192        if check:
 1193            samples_checked = []
 1194            for sample in samples_list:
 1195                if self.is_genotype_column(column=sample):
 1196                    samples_checked.append(sample)
 1197                else:
 1198                    log.warning(
 1199                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1200                    )
 1201            samples_list = samples_checked
 1202
 1203        # Return samples list
 1204        return samples_list
 1205
 1206    def is_genotype_column(self, column: str = None) -> bool:
 1207        """
 1208        This function checks if a given column is a genotype column in a database.
 1209
 1210        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1211        represents the column name in a database table. This method checks if the specified column is a
 1212        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1213        method of
 1214        :type column: str
 1215        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1216        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1217        column name and returns the result. If the `column` parameter is None, it returns False.
 1218        """
 1219
 1220        if column is not None:
 1221            return Database(database=self.get_input()).is_genotype_column(column=column)
 1222        else:
 1223            return False
 1224
 1225    def get_verbose(self) -> bool:
 1226        """
 1227        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1228        exist
 1229
 1230        :return: The value of the key "verbose" in the config dictionary.
 1231        """
 1232        return self.get_config().get("verbose", False)
 1233
 1234    def get_connexion_format(self) -> str:
 1235        """
 1236        It returns the connexion format of the object.
 1237        :return: The connexion_format is being returned.
 1238        """
 1239        connexion_format = self.connexion_format
 1240        if connexion_format not in ["duckdb", "sqlite"]:
 1241            log.error(f"Unknown connexion format {connexion_format}")
 1242            raise ValueError(f"Unknown connexion format {connexion_format}")
 1243        else:
 1244            return connexion_format
 1245
 1246    def insert_file_to_table(
 1247        self,
 1248        file,
 1249        columns: str,
 1250        header_len: int = 0,
 1251        sep: str = "\t",
 1252        chunksize: int = 1000000,
 1253    ) -> None:
 1254        """
 1255        The function reads a file in chunks and inserts each chunk into a table based on the specified
 1256        database format.
 1257
 1258        :param file: The `file` parameter is the file that you want to load into a table. It should be
 1259        the path to the file on your system
 1260        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
 1261        should contain the names of the columns in the table where the data will be inserted. The column
 1262        names should be separated by commas within the string. For example, if you have columns named
 1263        "id", "name
 1264        :type columns: str
 1265        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
 1266        the number of lines to skip at the beginning of the file before reading the actual data. This
 1267        parameter allows you to skip any header information present in the file before processing the
 1268        data, defaults to 0
 1269        :type header_len: int (optional)
 1270        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
 1271        separator character that is used in the file being read. In this case, the default separator is
 1272        set to `\t`, which represents a tab character. You can change this parameter to a different
 1273        separator character if, defaults to \t
 1274        :type sep: str (optional)
 1275        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
 1276        when processing the file in chunks. In the provided code snippet, the default value for
 1277        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
 1278        to 1000000
 1279        :type chunksize: int (optional)
 1280        """
 1281
 1282        # Config
 1283        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
 1284        connexion_format = self.get_connexion_format()
 1285
 1286        log.debug("chunksize: " + str(chunksize))
 1287
 1288        if chunksize:
 1289            for chunk in pd.read_csv(
 1290                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
 1291            ):
 1292                if connexion_format in ["duckdb"]:
 1293                    sql_insert_into = (
 1294                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
 1295                    )
 1296                    self.conn.execute(sql_insert_into)
 1297                elif connexion_format in ["sqlite"]:
 1298                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1299
 1300    def load_data(
 1301        self,
 1302        input_file: str = None,
 1303        drop_variants_table: bool = False,
 1304        sample_size: int = 20480,
 1305    ) -> None:
 1306        """
 1307        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1308        table before loading the data and specify a sample size.
 1309
 1310        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1311        table
 1312        :type input_file: str
 1313        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1314        determines whether the variants table should be dropped before loading the data. If set to
 1315        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1316        not be dropped, defaults to False
 1317        :type drop_variants_table: bool (optional)
 1318        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1319        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1320        20480
 1321        :type sample_size: int (optional)
 1322        """
 1323
 1324        log.info("Loading...")
 1325
 1326        # change input file
 1327        if input_file:
 1328            self.set_input(input_file)
 1329            self.set_header()
 1330
 1331        # drop variants table
 1332        if drop_variants_table:
 1333            self.drop_variants_table()
 1334
 1335        # get table variants
 1336        table_variants = self.get_table_variants()
 1337
 1338        # Access
 1339        access = self.get_config().get("access", None)
 1340        log.debug(f"access: {access}")
 1341
 1342        # Input format and compress
 1343        input_format = self.get_input_format()
 1344        input_compressed = self.get_input_compressed()
 1345        log.debug(f"input_format: {input_format}")
 1346        log.debug(f"input_compressed: {input_compressed}")
 1347
 1348        # input_compressed_format
 1349        if input_compressed:
 1350            input_compressed_format = "gzip"
 1351        else:
 1352            input_compressed_format = "none"
 1353        log.debug(f"input_compressed_format: {input_compressed_format}")
 1354
 1355        # Connexion format
 1356        connexion_format = self.get_connexion_format()
 1357
 1358        # Sample size
 1359        if not sample_size:
 1360            sample_size = -1
 1361        log.debug(f"sample_size: {sample_size}")
 1362
 1363        # Load data
 1364        log.debug(f"Load Data from {input_format}")
 1365
 1366        # DuckDB connexion
 1367        if connexion_format in ["duckdb"]:
 1368
 1369            # Database already exists
 1370            if self.input_format in ["db", "duckdb"]:
 1371
 1372                if connexion_format in ["duckdb"]:
 1373                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1374                else:
 1375                    log.error(
 1376                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1377                    )
 1378                    raise ValueError(
 1379                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1380                    )
 1381
 1382            # Load from existing database format
 1383            else:
 1384
 1385                try:
 1386                    # Create Table or View
 1387                    database = Database(database=self.input)
 1388                    sql_from = database.get_sql_from(sample_size=sample_size)
 1389
 1390                    if access in ["RO"]:
 1391                        sql_load = (
 1392                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1393                        )
 1394                    else:
 1395                        sql_load = (
 1396                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1397                        )
 1398                    self.conn.execute(sql_load)
 1399
 1400                except:
 1401                    # Format not available
 1402                    log.error(f"Input file format '{self.input_format}' not available")
 1403                    raise ValueError(
 1404                        f"Input file format '{self.input_format}' not available"
 1405                    )
 1406
 1407        # SQLite connexion
 1408        elif connexion_format in ["sqlite"] and input_format in [
 1409            "vcf",
 1410            "tsv",
 1411            "csv",
 1412            "psv",
 1413        ]:
 1414
 1415            # Main structure
 1416            structure = {
 1417                "#CHROM": "VARCHAR",
 1418                "POS": "INTEGER",
 1419                "ID": "VARCHAR",
 1420                "REF": "VARCHAR",
 1421                "ALT": "VARCHAR",
 1422                "QUAL": "VARCHAR",
 1423                "FILTER": "VARCHAR",
 1424                "INFO": "VARCHAR",
 1425            }
 1426
 1427            # Strcuture with samples
 1428            structure_complete = structure
 1429            if self.get_header_sample_list():
 1430                structure["FORMAT"] = "VARCHAR"
 1431                for sample in self.get_header_sample_list():
 1432                    structure_complete[sample] = "VARCHAR"
 1433
 1434            # Columns list for create and insert
 1435            sql_create_table_columns = []
 1436            sql_create_table_columns_list = []
 1437            for column in structure_complete:
 1438                column_type = structure_complete[column]
 1439                sql_create_table_columns.append(
 1440                    f'"{column}" {column_type} default NULL'
 1441                )
 1442                sql_create_table_columns_list.append(f'"{column}"')
 1443
 1444            # Create database
 1445            log.debug(f"Create Table {table_variants}")
 1446            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1447            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1448            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1449            self.conn.execute(sql_create_table)
 1450
 1451            # chunksize define length of file chunk load file
 1452            chunksize = 100000
 1453
 1454            # delimiter
 1455            delimiter = file_format_delimiters.get(input_format, "\t")
 1456
 1457            # Load the input file
 1458            with open(self.input, "rt") as input_file:
 1459
 1460                # Use the appropriate file handler based on the input format
 1461                if input_compressed:
 1462                    input_file = bgzf.open(self.input, "rt")
 1463                if input_format in ["vcf"]:
 1464                    header_len = self.get_header_length()
 1465                else:
 1466                    header_len = 0
 1467
 1468                # Insert the file contents into a table
 1469                self.insert_file_to_table(
 1470                    input_file,
 1471                    columns=sql_create_table_columns_list_sql,
 1472                    header_len=header_len,
 1473                    sep=delimiter,
 1474                    chunksize=chunksize,
 1475                )
 1476
 1477        else:
 1478            log.error(
 1479                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1480            )
 1481            raise ValueError(
 1482                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1483            )
 1484
 1485        # Explode INFOS fields into table fields
 1486        if self.get_explode_infos():
 1487            self.explode_infos(
 1488                prefix=self.get_explode_infos_prefix(),
 1489                fields=self.get_explode_infos_fields(),
 1490                force=True,
 1491            )
 1492
 1493        # Create index after insertion
 1494        self.create_indexes()
 1495
 1496    def get_explode_infos(self) -> bool:
 1497        """
 1498        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1499        to False if it is not set.
 1500        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1501        value. If the parameter is not present, it will return False.
 1502        """
 1503
 1504        return self.get_param().get("explode", {}).get("explode_infos", False)
 1505
 1506    def get_explode_infos_fields(
 1507        self,
 1508        explode_infos_fields: str = None,
 1509        remove_fields_not_in_header: bool = False,
 1510    ) -> list:
 1511        """
 1512        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1513        the input parameter `explode_infos_fields`.
 1514
 1515        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1516        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1517        comma-separated list of field names to explode
 1518        :type explode_infos_fields: str
 1519        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1520        flag that determines whether to remove fields that are not present in the header. If it is set
 1521        to `True`, any field that is not in the header will be excluded from the list of exploded
 1522        information fields. If it is set to `, defaults to False
 1523        :type remove_fields_not_in_header: bool (optional)
 1524        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1525        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1526        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1527        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1528        splitting the string by commas.
 1529        """
 1530
 1531        # If no fields, get it in param
 1532        if not explode_infos_fields:
 1533            explode_infos_fields = (
 1534                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1535            )
 1536
 1537        # If no fields, defined as all fields in header using keyword
 1538        if not explode_infos_fields:
 1539            explode_infos_fields = "*"
 1540
 1541        # If fields list not empty
 1542        if explode_infos_fields:
 1543
 1544            # Input fields list
 1545            if isinstance(explode_infos_fields, str):
 1546                fields_input = explode_infos_fields.split(",")
 1547            elif isinstance(explode_infos_fields, list):
 1548                fields_input = explode_infos_fields
 1549            else:
 1550                fields_input = []
 1551
 1552            # Fields list without * keyword
 1553            fields_without_all = fields_input.copy()
 1554            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1555                fields_without_all.remove("*")
 1556
 1557            # Fields in header
 1558            fields_in_header = sorted(list(set(self.get_header().infos)))
 1559
 1560            # Construct list of fields
 1561            fields_output = []
 1562            for field in fields_input:
 1563
 1564                # Strip field
 1565                field = field.strip()
 1566
 1567                # format keyword * in regex
 1568                if field.upper() in ["*"]:
 1569                    field = ".*"
 1570
 1571                # Find all fields with pattern
 1572                r = re.compile(field)
 1573                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1574
 1575                # Remove fields input from search
 1576                if field in fields_search:
 1577                    fields_search = [field]
 1578                elif fields_search != [field]:
 1579                    fields_search = sorted(
 1580                        list(set(fields_search).difference(fields_input))
 1581                    )
 1582
 1583                # If field is not in header (avoid not well formatted header)
 1584                if not fields_search and not remove_fields_not_in_header:
 1585                    fields_search = [field]
 1586
 1587                # Add found fields
 1588                for new_field in fields_search:
 1589                    # Add field, if not already exists, and if it is in header (if asked)
 1590                    if (
 1591                        new_field not in fields_output
 1592                        and (
 1593                            not remove_fields_not_in_header
 1594                            or new_field in fields_in_header
 1595                        )
 1596                        and new_field not in [".*"]
 1597                    ):
 1598                        fields_output.append(new_field)
 1599
 1600            return fields_output
 1601
 1602        else:
 1603
 1604            return []
 1605
 1606    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1607        """
 1608        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1609        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1610        not provided.
 1611
 1612        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1613        prefix to be used for exploding or expanding information
 1614        :type explode_infos_prefix: str
 1615        :return: the value of the variable `explode_infos_prefix`.
 1616        """
 1617
 1618        if not explode_infos_prefix:
 1619            explode_infos_prefix = (
 1620                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1621            )
 1622
 1623        return explode_infos_prefix
 1624
 1625    def add_column(
 1626        self,
 1627        table_name,
 1628        column_name,
 1629        column_type,
 1630        default_value=None,
 1631        drop: bool = False,
 1632    ) -> dict:
 1633        """
 1634        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1635        doesn't already exist.
 1636
 1637        :param table_name: The name of the table to which you want to add a column
 1638        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1639        to the table
 1640        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1641        want to add to the table. It should be a string that represents the desired data type, such as
 1642        "INTEGER", "TEXT", "REAL", etc
 1643        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1644        default value for the newly added column. If a default value is provided, it will be assigned to
 1645        the column for any existing rows that do not have a value for that column
 1646        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1647        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1648        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1649        to False
 1650        :type drop: bool (optional)
 1651        :return: a boolean value indicating whether the column was successfully added to the table.
 1652        """
 1653
 1654        # added
 1655        added = False
 1656        dropped = False
 1657
 1658        # Check if the column already exists in the table
 1659        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1660        columns = self.get_query_to_df(query).columns.tolist()
 1661        if column_name.upper() in [c.upper() for c in columns]:
 1662            log.debug(
 1663                f"The {column_name} column already exists in the {table_name} table"
 1664            )
 1665            if drop:
 1666                self.drop_column(table_name=table_name, column_name=column_name)
 1667                dropped = True
 1668            else:
 1669                return None
 1670        else:
 1671            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1672
 1673        # Add column in table
 1674        add_column_query = (
 1675            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1676        )
 1677        if default_value is not None:
 1678            add_column_query += f" DEFAULT {default_value}"
 1679        self.execute_query(add_column_query)
 1680        added = not dropped
 1681        log.debug(
 1682            f"The {column_name} column was successfully added to the {table_name} table"
 1683        )
 1684
 1685        if added:
 1686            added_column = {
 1687                "table_name": table_name,
 1688                "column_name": column_name,
 1689                "column_type": column_type,
 1690                "default_value": default_value,
 1691            }
 1692        else:
 1693            added_column = None
 1694
 1695        return added_column
 1696
 1697    def drop_column(
 1698        self, column: dict = None, table_name: str = None, column_name: str = None
 1699    ) -> bool:
 1700        """
 1701        The `drop_column` function drops a specified column from a given table in a database and returns
 1702        True if the column was successfully dropped, and False if the column does not exist in the
 1703        table.
 1704
 1705        :param column: The `column` parameter is a dictionary that contains information about the column
 1706        you want to drop. It has two keys:
 1707        :type column: dict
 1708        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1709        drop a column
 1710        :type table_name: str
 1711        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1712        from the table
 1713        :type column_name: str
 1714        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1715        and False if the column does not exist in the table.
 1716        """
 1717
 1718        # Find column infos
 1719        if column:
 1720            if isinstance(column, dict):
 1721                table_name = column.get("table_name", None)
 1722                column_name = column.get("column_name", None)
 1723            elif isinstance(column, str):
 1724                table_name = self.get_table_variants()
 1725                column_name = column
 1726            else:
 1727                table_name = None
 1728                column_name = None
 1729
 1730        if not table_name and not column_name:
 1731            return False
 1732
 1733        # Removed
 1734        removed = False
 1735
 1736        # Check if the column already exists in the table
 1737        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1738        columns = self.get_query_to_df(query).columns.tolist()
 1739        if column_name in columns:
 1740            log.debug(f"The {column_name} column exists in the {table_name} table")
 1741        else:
 1742            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1743            return False
 1744
 1745        # Add column in table # ALTER TABLE integers DROP k
 1746        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1747        self.execute_query(add_column_query)
 1748        removed = True
 1749        log.debug(
 1750            f"The {column_name} column was successfully dropped to the {table_name} table"
 1751        )
 1752
 1753        return removed
 1754
 1755    def explode_infos(
 1756        self,
 1757        prefix: str = None,
 1758        create_index: bool = False,
 1759        fields: list = None,
 1760        force: bool = False,
 1761        proccess_all_fields_together: bool = False,
 1762        table: str = None,
 1763    ) -> list:
 1764        """
 1765        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1766        individual columns, returning a list of added columns.
 1767
 1768        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1769        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1770        `self.get_explode_infos_prefix()` as the prefix
 1771        :type prefix: str
 1772        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1773        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1774        `False`, indexes will not be created. The default value is `False`, defaults to False
 1775        :type create_index: bool (optional)
 1776        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1777        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1778        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1779        a list to the `
 1780        :type fields: list
 1781        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1782        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1783        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1784        defaults to False
 1785        :type force: bool (optional)
 1786        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1787        flag that determines whether to process all the INFO fields together or individually. If set to
 1788        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1789        be processed individually. The default value is, defaults to False
 1790        :type proccess_all_fields_together: bool (optional)
 1791        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1792        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1793        a value for the `table` parameter, the function will use that table name. If the `table`
 1794        parameter is
 1795        :type table: str
 1796        :return: The `explode_infos` function returns a list of added columns.
 1797        """
 1798
 1799        # drop indexes
 1800        self.drop_indexes()
 1801
 1802        # connexion format
 1803        connexion_format = self.get_connexion_format()
 1804
 1805        # Access
 1806        access = self.get_config().get("access", None)
 1807
 1808        # Added columns
 1809        added_columns = []
 1810
 1811        if access not in ["RO"]:
 1812
 1813            # prefix
 1814            if prefix in [None, True] or not isinstance(prefix, str):
 1815                if self.get_explode_infos_prefix() not in [None, True]:
 1816                    prefix = self.get_explode_infos_prefix()
 1817                else:
 1818                    prefix = "INFO/"
 1819
 1820            # table variants
 1821            if table is not None:
 1822                table_variants = table
 1823            else:
 1824                table_variants = self.get_table_variants(clause="select")
 1825
 1826            # extra infos
 1827            try:
 1828                extra_infos = self.get_extra_infos()
 1829            except:
 1830                extra_infos = []
 1831
 1832            # Header infos
 1833            header_infos = self.get_header().infos
 1834
 1835            log.debug(
 1836                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1837            )
 1838
 1839            sql_info_alter_table_array = []
 1840
 1841            # Info fields to check
 1842            fields_list = list(header_infos)
 1843            if fields:
 1844                fields_list += fields
 1845            fields_list = set(fields_list)
 1846
 1847            # If no fields
 1848            if not fields:
 1849                fields = []
 1850
 1851            # Translate fields if patterns
 1852            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1853
 1854            for info in fields:
 1855
 1856                info_id_sql = prefix + info
 1857
 1858                if (
 1859                    info in fields_list
 1860                    or prefix + info in fields_list
 1861                    or info in extra_infos
 1862                ):
 1863
 1864                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1865
 1866                    if info in header_infos:
 1867                        info_type = header_infos[info].type
 1868                        info_num = header_infos[info].num
 1869                    else:
 1870                        info_type = "String"
 1871                        info_num = 0
 1872
 1873                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1874                    if info_num != 1:
 1875                        type_sql = "VARCHAR"
 1876
 1877                    # Add field
 1878                    added_column = self.add_column(
 1879                        table_name=table_variants,
 1880                        column_name=info_id_sql,
 1881                        column_type=type_sql,
 1882                        default_value="null",
 1883                        drop=force,
 1884                    )
 1885
 1886                    if added_column:
 1887                        added_columns.append(added_column)
 1888
 1889                    if added_column or force:
 1890
 1891                        # add field to index
 1892                        self.index_additionnal_fields.append(info_id_sql)
 1893
 1894                        # Update field array
 1895                        if connexion_format in ["duckdb"]:
 1896                            update_info_field = f"""
 1897                            "{info_id_sql}" =
 1898                                CASE
 1899                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1900                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1901                                END
 1902                            """
 1903                        elif connexion_format in ["sqlite"]:
 1904                            update_info_field = f"""
 1905                                "{info_id_sql}" =
 1906                                    CASE
 1907                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1908                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1909                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1910                                    END
 1911                            """
 1912
 1913                        sql_info_alter_table_array.append(update_info_field)
 1914
 1915            if sql_info_alter_table_array:
 1916
 1917                # By chromosomes
 1918                try:
 1919                    chromosomes_list = list(
 1920                        self.get_query_to_df(
 1921                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1922                        )["#CHROM"]
 1923                    )
 1924                except:
 1925                    chromosomes_list = [None]
 1926
 1927                for chrom in chromosomes_list:
 1928                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1929
 1930                    # Where clause
 1931                    where_clause = ""
 1932                    if chrom and len(chromosomes_list) > 1:
 1933                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1934
 1935                    # Update table
 1936                    if proccess_all_fields_together:
 1937                        sql_info_alter_table_array_join = ", ".join(
 1938                            sql_info_alter_table_array
 1939                        )
 1940                        if sql_info_alter_table_array_join:
 1941                            sql_info_alter_table = f"""
 1942                                UPDATE {table_variants}
 1943                                SET {sql_info_alter_table_array_join}
 1944                                {where_clause}
 1945                                """
 1946                            log.debug(
 1947                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1948                            )
 1949                            # log.debug(sql_info_alter_table)
 1950                            self.conn.execute(sql_info_alter_table)
 1951                    else:
 1952                        sql_info_alter_num = 0
 1953                        for sql_info_alter in sql_info_alter_table_array:
 1954                            sql_info_alter_num += 1
 1955                            sql_info_alter_table = f"""
 1956                                UPDATE {table_variants}
 1957                                SET {sql_info_alter}
 1958                                {where_clause}
 1959                                """
 1960                            log.debug(
 1961                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1962                            )
 1963                            # log.debug(sql_info_alter_table)
 1964                            self.conn.execute(sql_info_alter_table)
 1965
 1966        # create indexes
 1967        if create_index:
 1968            self.create_indexes()
 1969
 1970        return added_columns
 1971
 1972    def create_indexes(self) -> None:
 1973        """
 1974        Create indexes on the table after insertion
 1975        """
 1976
 1977        # Access
 1978        access = self.get_config().get("access", None)
 1979
 1980        # get table variants
 1981        table_variants = self.get_table_variants("FROM")
 1982
 1983        if self.get_indexing() and access not in ["RO"]:
 1984            # Create index
 1985            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1986            self.conn.execute(sql_create_table_index)
 1987            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1988            self.conn.execute(sql_create_table_index)
 1989            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1990            self.conn.execute(sql_create_table_index)
 1991            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1992            self.conn.execute(sql_create_table_index)
 1993            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1994            self.conn.execute(sql_create_table_index)
 1995            for field in self.index_additionnal_fields:
 1996                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1997                self.conn.execute(sql_create_table_index)
 1998
 1999    def drop_indexes(self) -> None:
 2000        """
 2001        Create indexes on the table after insertion
 2002        """
 2003
 2004        # Access
 2005        access = self.get_config().get("access", None)
 2006
 2007        # get table variants
 2008        table_variants = self.get_table_variants("FROM")
 2009
 2010        # Get database format
 2011        connexion_format = self.get_connexion_format()
 2012
 2013        if access not in ["RO"]:
 2014            if connexion_format in ["duckdb"]:
 2015                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2016            elif connexion_format in ["sqlite"]:
 2017                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2018
 2019            list_indexes = self.conn.execute(sql_list_indexes)
 2020            index_names = [row[0] for row in list_indexes.fetchall()]
 2021            for index in index_names:
 2022                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2023                self.conn.execute(sql_drop_table_index)
 2024
 2025    def read_vcf_header(self, f) -> list:
 2026        """
 2027        It reads the header of a VCF file and returns a list of the header lines
 2028
 2029        :param f: the file object
 2030        :return: The header lines of the VCF file.
 2031        """
 2032
 2033        header_list = []
 2034        for line in f:
 2035            header_list.append(line)
 2036            if line.startswith("#CHROM"):
 2037                break
 2038        return header_list
 2039
 2040    def read_vcf_header_file(self, file: str = None) -> list:
 2041        """
 2042        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2043        uncompressed files.
 2044
 2045        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2046        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2047        default to `None`
 2048        :type file: str
 2049        :return: The function `read_vcf_header_file` returns a list.
 2050        """
 2051
 2052        if self.get_input_compressed(input_file=file):
 2053            with bgzf.open(file, "rt") as f:
 2054                return self.read_vcf_header(f=f)
 2055        else:
 2056            with open(file, "rt") as f:
 2057                return self.read_vcf_header(f=f)
 2058
 2059    def execute_query(self, query: str):
 2060        """
 2061        It takes a query as an argument, executes it, and returns the results
 2062
 2063        :param query: The query to be executed
 2064        :return: The result of the query is being returned.
 2065        """
 2066        if query:
 2067            return self.conn.execute(query)  # .fetchall()
 2068        else:
 2069            return None
 2070
 2071    def export_output(
 2072        self,
 2073        output_file: str | None = None,
 2074        output_header: str | None = None,
 2075        export_header: bool = True,
 2076        query: str | None = None,
 2077        parquet_partitions: list | None = None,
 2078        chunk_size: int | None = None,
 2079        threads: int | None = None,
 2080        sort: bool = False,
 2081        index: bool = False,
 2082        order_by: str | None = None,
 2083    ) -> bool:
 2084        """
 2085        The `export_output` function exports data from a VCF file to a specified output file in various
 2086        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2087
 2088        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2089        output file to be generated by the function. This is where the exported data will be saved
 2090        :type output_file: str
 2091        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2092        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2093        header will be exported to a file with the same name as the `output_file` parameter, but with
 2094        the extension "
 2095        :type output_header: str
 2096        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2097        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2098        True, the header will be exported to a file. If `export_header` is False, the header will not
 2099        be, defaults to True, if output format is not VCF
 2100        :type export_header: bool (optional)
 2101        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2102        select specific data from the VCF file before exporting it. If provided, only the data that
 2103        matches the query will be exported
 2104        :type query: str
 2105        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2106        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2107        organize data in a hierarchical directory structure based on the values of one or more columns.
 2108        This can improve query performance when working with large datasets
 2109        :type parquet_partitions: list
 2110        :param chunk_size: The `chunk_size` parameter specifies the number of
 2111        records in batch when exporting data in Parquet format. This parameter is used for
 2112        partitioning the Parquet file into multiple files.
 2113        :type chunk_size: int
 2114        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2115        threads to be used during the export process. It determines the level of parallelism and can
 2116        improve the performance of the export operation. If not provided, the function will use the
 2117        default number of threads
 2118        :type threads: int
 2119        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2120        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2121        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2122        False
 2123        :type sort: bool (optional)
 2124        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2125        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2126        no index will be created. The default value is False, defaults to False
 2127        :type index: bool (optional)
 2128        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2129        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2130        :type order_by: str
 2131        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2132        None if it doesn't.
 2133        """
 2134
 2135        # Log
 2136        log.info("Exporting...")
 2137
 2138        # Full path
 2139        output_file = full_path(output_file)
 2140        output_header = full_path(output_header)
 2141
 2142        # Config
 2143        config = self.get_config()
 2144
 2145        # Param
 2146        param = self.get_param()
 2147
 2148        # Tmp files to remove
 2149        tmp_to_remove = []
 2150
 2151        # If no output, get it
 2152        if not output_file:
 2153            output_file = self.get_output()
 2154
 2155        # If not threads
 2156        if not threads:
 2157            threads = self.get_threads()
 2158
 2159        # Auto header name with extension
 2160        if export_header or output_header:
 2161            if not output_header:
 2162                output_header = f"{output_file}.hdr"
 2163            # Export header
 2164            self.export_header(output_file=output_file)
 2165
 2166        # Switch off export header if VCF output
 2167        output_file_type = get_file_format(output_file)
 2168        if output_file_type in ["vcf"]:
 2169            export_header = False
 2170            tmp_to_remove.append(output_header)
 2171
 2172        # Chunk size
 2173        if not chunk_size:
 2174            chunk_size = config.get("chunk_size", None)
 2175
 2176        # Parquet partition
 2177        if not parquet_partitions:
 2178            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2179        if parquet_partitions and isinstance(parquet_partitions, str):
 2180            parquet_partitions = parquet_partitions.split(",")
 2181
 2182        # Order by
 2183        if not order_by:
 2184            order_by = param.get("export", {}).get("order_by", "")
 2185
 2186        # Header in output
 2187        header_in_output = param.get("export", {}).get("include_header", False)
 2188
 2189        # Database
 2190        database_source = self.get_connexion()
 2191
 2192        # Connexion format
 2193        connexion_format = self.get_connexion_format()
 2194
 2195        # Explode infos
 2196        if self.get_explode_infos():
 2197            self.explode_infos(
 2198                prefix=self.get_explode_infos_prefix(),
 2199                fields=self.get_explode_infos_fields(),
 2200                force=False,
 2201            )
 2202
 2203        # if connexion_format in ["sqlite"] or query:
 2204        if connexion_format in ["sqlite"]:
 2205
 2206            # Export in Parquet
 2207            random_tmp = "".join(
 2208                random.choice(string.ascii_lowercase) for i in range(10)
 2209            )
 2210            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2211            tmp_to_remove.append(database_source)
 2212
 2213            # Table Variants
 2214            table_variants = self.get_table_variants()
 2215
 2216            # Create export query
 2217            sql_query_export_subquery = f"""
 2218                SELECT * FROM {table_variants}
 2219                """
 2220
 2221            # Write source file
 2222            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2223
 2224        # Create database
 2225        database = Database(
 2226            database=database_source,
 2227            table="variants",
 2228            header_file=output_header,
 2229            conn_config=self.get_connexion_config(),
 2230        )
 2231
 2232        # Existing colomns header
 2233        existing_columns_header = database.get_header_columns_from_database()
 2234
 2235        # Sample list
 2236        if output_file_type in ["vcf"]:
 2237            get_samples = self.get_samples()
 2238            get_samples_check = self.get_samples_check()
 2239            samples_force = get_samples is not None
 2240            sample_list = self.get_header_sample_list(
 2241                check=get_samples_check,
 2242                samples=get_samples,
 2243                samples_force=samples_force,
 2244            )
 2245        else:
 2246            sample_list = None
 2247
 2248        # Export file
 2249        database.export(
 2250            output_database=output_file,
 2251            output_header=output_header,
 2252            existing_columns_header=existing_columns_header,
 2253            parquet_partitions=parquet_partitions,
 2254            chunk_size=chunk_size,
 2255            threads=threads,
 2256            sort=sort,
 2257            index=index,
 2258            header_in_output=header_in_output,
 2259            order_by=order_by,
 2260            query=query,
 2261            export_header=export_header,
 2262            sample_list=sample_list,
 2263        )
 2264
 2265        # Remove
 2266        remove_if_exists(tmp_to_remove)
 2267
 2268        return (os.path.exists(output_file) or None) and (
 2269            os.path.exists(output_file) or None
 2270        )
 2271
 2272    def get_extra_infos(self, table: str = None) -> list:
 2273        """
 2274        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2275        in the header.
 2276
 2277        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2278        name of the table from which you want to retrieve the extra columns that are not present in the
 2279        header. If the `table` parameter is not provided when calling the function, it will default to
 2280        using the variants
 2281        :type table: str
 2282        :return: A list of columns that are in the specified table but not in the header of the table.
 2283        """
 2284
 2285        header_columns = []
 2286
 2287        if not table:
 2288            table = self.get_table_variants(clause="from")
 2289            header_columns = self.get_header_columns()
 2290
 2291        # Check all columns in the database
 2292        query = f""" SELECT * FROM {table} LIMIT 1 """
 2293        log.debug(f"query {query}")
 2294        table_columns = self.get_query_to_df(query).columns.tolist()
 2295        extra_columns = []
 2296
 2297        # Construct extra infos (not in header)
 2298        for column in table_columns:
 2299            if column not in header_columns:
 2300                extra_columns.append(column)
 2301
 2302        return extra_columns
 2303
 2304    def get_extra_infos_sql(self, table: str = None) -> str:
 2305        """
 2306        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2307        by double quotes
 2308
 2309        :param table: The name of the table to get the extra infos from. If None, the default table is
 2310        used
 2311        :type table: str
 2312        :return: A string of the extra infos
 2313        """
 2314
 2315        return ", ".join(
 2316            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2317        )
 2318
 2319    def export_header(
 2320        self,
 2321        header_name: str = None,
 2322        output_file: str = None,
 2323        output_file_ext: str = ".hdr",
 2324        clean_header: bool = True,
 2325        remove_chrom_line: bool = False,
 2326    ) -> str:
 2327        """
 2328        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2329        specified options, and writes it to a new file.
 2330
 2331        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2332        this parameter is not specified, the header will be written to the output file
 2333        :type header_name: str
 2334        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2335        specify the name of the output file where the header will be written. If this parameter is not
 2336        provided, the header will be written to a temporary file
 2337        :type output_file: str
 2338        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2339        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2340        if not specified by the user. This extension will be appended to the `output_file` name to
 2341        create the final, defaults to .hdr
 2342        :type output_file_ext: str (optional)
 2343        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2344        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2345        `True`, the function will clean the header by modifying certain lines based on a specific
 2346        pattern. If `clean_header`, defaults to True
 2347        :type clean_header: bool (optional)
 2348        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2349        boolean flag that determines whether the #CHROM line should be removed from the header before
 2350        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2351        defaults to False
 2352        :type remove_chrom_line: bool (optional)
 2353        :return: The function `export_header` returns the name of the temporary header file that is
 2354        created.
 2355        """
 2356
 2357        if not header_name and not output_file:
 2358            output_file = self.get_output()
 2359
 2360        if self.get_header():
 2361
 2362            # Get header object
 2363            header_obj = self.get_header()
 2364
 2365            # Create database
 2366            db_for_header = Database(database=self.get_input())
 2367
 2368            # Get real columns in the file
 2369            db_header_columns = db_for_header.get_columns()
 2370
 2371            with tempfile.TemporaryDirectory() as tmpdir:
 2372
 2373                # Write header file
 2374                header_file_tmp = os.path.join(tmpdir, "header")
 2375                f = open(header_file_tmp, "w")
 2376                vcf.Writer(f, header_obj)
 2377                f.close()
 2378
 2379                # Replace #CHROM line with rel columns
 2380                header_list = db_for_header.read_header_file(
 2381                    header_file=header_file_tmp
 2382                )
 2383                header_list[-1] = "\t".join(db_header_columns)
 2384
 2385                # Remove CHROM line
 2386                if remove_chrom_line:
 2387                    header_list.pop()
 2388
 2389                # Clean header
 2390                if clean_header:
 2391                    header_list_clean = []
 2392                    for head in header_list:
 2393                        # Clean head for malformed header
 2394                        head_clean = head
 2395                        head_clean = re.subn(
 2396                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2397                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2398                            head_clean,
 2399                            2,
 2400                        )[0]
 2401                        # Write header
 2402                        header_list_clean.append(head_clean)
 2403                    header_list = header_list_clean
 2404
 2405            tmp_header_name = output_file + output_file_ext
 2406
 2407            f = open(tmp_header_name, "w")
 2408            for line in header_list:
 2409                f.write(line)
 2410            f.close()
 2411
 2412        return tmp_header_name
 2413
 2414    def export_variant_vcf(
 2415        self,
 2416        vcf_file,
 2417        remove_info: bool = False,
 2418        add_samples: bool = True,
 2419        list_samples: list = [],
 2420        where_clause: str = "",
 2421        index: bool = False,
 2422        threads: int | None = None,
 2423    ) -> bool | None:
 2424        """
 2425        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2426        remove INFO field, add samples, and control compression and indexing.
 2427
 2428        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2429        written to. It is the output file that will contain the filtered VCF data based on the specified
 2430        parameters
 2431        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2432        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2433        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2434        in, defaults to False
 2435        :type remove_info: bool (optional)
 2436        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2437        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2438        If set to False, the samples will be removed. The default value is True, defaults to True
 2439        :type add_samples: bool (optional)
 2440        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2441        in the output VCF file. By default, all samples will be included. If you provide a list of
 2442        samples, only those samples will be included in the output file
 2443        :type list_samples: list
 2444        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2445        determines whether or not to create an index for the output VCF file. If `index` is set to
 2446        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2447        :type index: bool (optional)
 2448        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2449        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2450        will be used during the export process. More threads can potentially speed up the export process
 2451        by utilizing multiple cores of the processor. If
 2452        :type threads: int | None
 2453        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2454        method with various parameters including the output file, query, threads, sort flag, and index
 2455        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2456        specified parameters and configurations provided in the `export_variant_vcf` function.
 2457        """
 2458
 2459        # Config
 2460        config = self.get_config()
 2461
 2462        # Extract VCF
 2463        log.debug("Export VCF...")
 2464
 2465        # Table variants
 2466        table_variants = self.get_table_variants()
 2467
 2468        # Threads
 2469        if not threads:
 2470            threads = self.get_threads()
 2471
 2472        # Info fields
 2473        if remove_info:
 2474            if not isinstance(remove_info, str):
 2475                remove_info = "."
 2476            info_field = f"""'{remove_info}' as INFO"""
 2477        else:
 2478            info_field = "INFO"
 2479
 2480        # Samples fields
 2481        if add_samples:
 2482            if not list_samples:
 2483                list_samples = self.get_header_sample_list()
 2484            if list_samples:
 2485                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2486            else:
 2487                samples_fields = ""
 2488            log.debug(f"samples_fields: {samples_fields}")
 2489        else:
 2490            samples_fields = ""
 2491
 2492        # Where clause
 2493        if where_clause is None:
 2494            where_clause = ""
 2495
 2496        # Variants
 2497        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2498        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2499        log.debug(f"sql_query_select={sql_query_select}")
 2500
 2501        return self.export_output(
 2502            output_file=vcf_file,
 2503            output_header=None,
 2504            export_header=True,
 2505            query=sql_query_select,
 2506            parquet_partitions=None,
 2507            chunk_size=config.get("chunk_size", None),
 2508            threads=threads,
 2509            sort=True,
 2510            index=index,
 2511            order_by=None,
 2512        )
 2513
 2514    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2515        """
 2516        It takes a list of commands and runs them in parallel using the number of threads specified
 2517
 2518        :param commands: A list of commands to run
 2519        :param threads: The number of threads to use, defaults to 1 (optional)
 2520        """
 2521
 2522        run_parallel_commands(commands, threads)
 2523
 2524    def get_threads(self, default: int = 1) -> int:
 2525        """
 2526        This function returns the number of threads to use for a job, with a default value of 1 if not
 2527        specified.
 2528
 2529        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2530        default number of threads to use if no specific value is provided. If no value is provided for
 2531        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2532        used, defaults to 1
 2533        :type default: int (optional)
 2534        :return: the number of threads to use for the current job.
 2535        """
 2536
 2537        # Config
 2538        config = self.get_config()
 2539
 2540        # Param
 2541        param = self.get_param()
 2542
 2543        # Input threads
 2544        input_thread = param.get("threads", config.get("threads", None))
 2545
 2546        # Check threads
 2547        if not input_thread:
 2548            threads = default
 2549        elif int(input_thread) <= 0:
 2550            threads = os.cpu_count()
 2551        else:
 2552            threads = int(input_thread)
 2553        return threads
 2554
 2555    def get_memory(self, default: str = None) -> str:
 2556        """
 2557        This function retrieves the memory value from parameters or configuration with a default value
 2558        if not found.
 2559
 2560        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2561        default value is used as a fallback in case the `memory` parameter is not provided in the
 2562        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2563        the function
 2564        :type default: str
 2565        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2566        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2567        return the default value provided as an argument to the function.
 2568        """
 2569
 2570        # Config
 2571        config = self.get_config()
 2572
 2573        # Param
 2574        param = self.get_param()
 2575
 2576        # Input threads
 2577        input_memory = param.get("memory", config.get("memory", None))
 2578
 2579        # Check threads
 2580        if input_memory:
 2581            memory = input_memory
 2582        else:
 2583            memory = default
 2584
 2585        return memory
 2586
 2587    def update_from_vcf(self, vcf_file: str) -> None:
 2588        """
 2589        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2590
 2591        :param vcf_file: the path to the VCF file
 2592        """
 2593
 2594        connexion_format = self.get_connexion_format()
 2595
 2596        if connexion_format in ["duckdb"]:
 2597            self.update_from_vcf_duckdb(vcf_file)
 2598        elif connexion_format in ["sqlite"]:
 2599            self.update_from_vcf_sqlite(vcf_file)
 2600
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file.

        Rows are matched on #CHROM/POS/REF/ALT; the incoming INFO string is
        appended to the existing one, separated by ';' when both are non-empty.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines.
        # NOTE: the query below references the DataFrame by its Python variable
        # name (DuckDB replacement scan), so 'vcf_df' must keep this exact name.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO; '' and '.' are treated as
        # empty. DuckDB's concat() ignores NULL, so variants with no matching
        # VCF row (subquery returns NULL) keep their INFO unchanged.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2656
 2657    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2658        """
 2659        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2660        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2661        table
 2662
 2663        :param vcf_file: The path to the VCF file you want to update the database with
 2664        """
 2665
 2666        # Create a temporary table for the VCF
 2667        table_vcf = "tmp_vcf"
 2668        sql_create = (
 2669            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2670        )
 2671        self.conn.execute(sql_create)
 2672
 2673        # Loading VCF into temporaire table
 2674        vcf_df = pd.read_csv(
 2675            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2676        )
 2677        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2678        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2679
 2680        # Update table 'variants' with VCF data
 2681        # warning: CONCAT as || operator
 2682        sql_query_update = f"""
 2683            UPDATE variants as table_variants
 2684            SET INFO = CASE
 2685                            WHEN INFO NOT IN ('', '.')
 2686                            THEN INFO
 2687                            ELSE ''
 2688                        END ||
 2689                        (
 2690                        SELECT 
 2691                            CASE 
 2692                                WHEN table_variants.INFO NOT IN ('','.') 
 2693                                    AND table_vcf.INFO NOT IN ('','.')  
 2694                                THEN ';' 
 2695                                ELSE '' 
 2696                            END || 
 2697                            CASE 
 2698                                WHEN table_vcf.INFO NOT IN ('','.') 
 2699                                THEN table_vcf.INFO 
 2700                                ELSE '' 
 2701                            END
 2702                        FROM {table_vcf} as table_vcf
 2703                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2704                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2705                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2706                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2707                        )
 2708        """
 2709        self.conn.execute(sql_query_update)
 2710
 2711        # Drop temporary table
 2712        sql_drop = f"DROP TABLE {table_vcf}"
 2713        self.conn.execute(sql_drop)
 2714
 2715    def drop_variants_table(self) -> None:
 2716        """
 2717        > This function drops the variants table
 2718        """
 2719
 2720        table_variants = self.get_table_variants()
 2721        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2722        self.conn.execute(sql_table_variants)
 2723
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        It adds a column to the variants table called `variant_id` and populates it with a hash of the
        assembly and the `#CHROM`, `POS`, `REF`, and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param overrides config, falls back to the default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column; the added columns are
        # dropped again at the end of this method
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name when none provided
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column (skipped if already present, unless forced)
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' expands to a quoted *string
            # literal* inside the SQL, so hash() receives the constant column
            # name rather than each row's SVTYPE value — confirm whether this
            # is intended before changing it (fixing would alter all IDs)
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
 2782
 2783    def get_variant_id_column(
 2784        self, variant_id_column: str = "variant_id", force: bool = None
 2785    ) -> str:
 2786        """
 2787        This function returns the variant_id column name
 2788
 2789        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2790        defaults to variant_id
 2791        :type variant_id_column: str (optional)
 2792        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2793        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2794        if it is not already set, or if it is set
 2795        :type force: bool
 2796        :return: The variant_id column name.
 2797        """
 2798
 2799        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2800
 2801    ###
 2802    # Annotation
 2803    ###
 2804
 2805    def scan_databases(
 2806        self,
 2807        database_formats: list = ["parquet"],
 2808        database_releases: list = ["current"],
 2809    ) -> dict:
 2810        """
 2811        The function `scan_databases` scans for available databases based on specified formats and
 2812        releases.
 2813
 2814        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2815        of the databases to be scanned. In this case, the accepted format is "parquet"
 2816        :type database_formats: list ["parquet"]
 2817        :param database_releases: The `database_releases` parameter is a list that specifies the
 2818        releases of the databases to be scanned. In the provided function, the default value for
 2819        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2820        databases that are in the "current"
 2821        :type database_releases: list
 2822        :return: The function `scan_databases` returns a dictionary containing information about
 2823        databases that match the specified formats and releases.
 2824        """
 2825
 2826        # Config
 2827        config = self.get_config()
 2828
 2829        # Param
 2830        param = self.get_param()
 2831
 2832        # Param - Assembly
 2833        assembly = param.get("assembly", config.get("assembly", None))
 2834        if not assembly:
 2835            assembly = DEFAULT_ASSEMBLY
 2836            log.warning(f"Default assembly '{assembly}'")
 2837
 2838        # Scan for availabled databases
 2839        log.info(
 2840            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2841        )
 2842        databases_infos_dict = databases_infos(
 2843            database_folder_releases=database_releases,
 2844            database_formats=database_formats,
 2845            assembly=assembly,
 2846            config=config,
 2847        )
 2848        log.info(
 2849            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2850        )
 2851
 2852        return databases_infos_dict
 2853
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        First, "quick annotation" inputs (the comma-separated `annotations`
        parameter and the per-tool `annotation_*` parameters) are normalized
        into the structured `param["annotation"]` dict, resolving each database
        file against the configured annotation folders. Then the tool-specific
        annotation methods are dispatched (parquet, bcftools, snpsift, annovar,
        snpeff, exomiser, splice). Finally, INFO fields are optionally exploded
        into table columns.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config, falls back to the default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (deduplicated union of the three
        # configured folder lists)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations as a list (comma-separated string input)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold the per-tool annotation_* parameters into the
        # quick-annotation list, prefixed with the tool name where relevant
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # multiple files are joined with '+' (',' is the list separator)
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            # multiple files are joined with '+' (',' is the list separator)
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into a comma-separated string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters, keyed by annotation entry
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode 'ALL' entries into every database found by a scan
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases), e.g.
                    # "ALL:format=parquet+vcf:release=current"
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff ("snpeff:<options>")
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar ("annovar:<db>:<db>...")
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser (key=value options string)
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice (key=value options string)
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (database file entries)
                    else:

                        # Tools detection: explicit "bcftools:"/"snpsift:"
                        # prefix, otherwise decided per file below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both separate files)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly set)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    bcftools_preference = False

                                    # Check Annotation Tool: prefer bcftools
                                    # only for compressed+indexed vcf/bed when
                                    # enabled, otherwise use parquet
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the
                                    # resolved file under the chosen tool
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Dispatch the tool-specific annotation methods
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 3225
 3226    def annotation_snpsift(self, threads: int = None) -> None:
 3227        """
 3228        This function annotate with bcftools
 3229
 3230        :param threads: Number of threads to use
 3231        :return: the value of the variable "return_value".
 3232        """
 3233
 3234        # DEBUG
 3235        log.debug("Start annotation with bcftools databases")
 3236
 3237        # Threads
 3238        if not threads:
 3239            threads = self.get_threads()
 3240        log.debug("Threads: " + str(threads))
 3241
 3242        # Config
 3243        config = self.get_config()
 3244        log.debug("Config: " + str(config))
 3245
 3246        # Config - snpSift
 3247        snpsift_bin_command = get_bin_command(
 3248            bin="SnpSift.jar",
 3249            tool="snpsift",
 3250            bin_type="jar",
 3251            config=config,
 3252            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3253        )
 3254        if not snpsift_bin_command:
 3255            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3256            log.error(msg_err)
 3257            raise ValueError(msg_err)
 3258
 3259        # Config - bcftools
 3260        bcftools_bin_command = get_bin_command(
 3261            bin="bcftools",
 3262            tool="bcftools",
 3263            bin_type="bin",
 3264            config=config,
 3265            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3266        )
 3267        if not bcftools_bin_command:
 3268            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3269            log.error(msg_err)
 3270            raise ValueError(msg_err)
 3271
 3272        # Config - BCFTools databases folders
 3273        databases_folders = set(
 3274            self.get_config()
 3275            .get("folders", {})
 3276            .get("databases", {})
 3277            .get("annotations", ["."])
 3278            + self.get_config()
 3279            .get("folders", {})
 3280            .get("databases", {})
 3281            .get("bcftools", ["."])
 3282        )
 3283        log.debug("Databases annotations: " + str(databases_folders))
 3284
 3285        # Param
 3286        annotations = (
 3287            self.get_param()
 3288            .get("annotation", {})
 3289            .get("snpsift", {})
 3290            .get("annotations", None)
 3291        )
 3292        log.debug("Annotations: " + str(annotations))
 3293
 3294        # Assembly
 3295        assembly = self.get_param().get(
 3296            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3297        )
 3298
 3299        # Data
 3300        table_variants = self.get_table_variants()
 3301
 3302        # Check if not empty
 3303        log.debug("Check if not empty")
 3304        sql_query_chromosomes = (
 3305            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3306        )
 3307        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3308        if not sql_query_chromosomes_df["count"][0]:
 3309            log.info(f"VCF empty")
 3310            return
 3311
 3312        # VCF header
 3313        vcf_reader = self.get_header()
 3314        log.debug("Initial header: " + str(vcf_reader.infos))
 3315
 3316        # Existing annotations
 3317        for vcf_annotation in self.get_header().infos:
 3318
 3319            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3320            log.debug(
 3321                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3322            )
 3323
 3324        if annotations:
 3325
 3326            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3327
 3328                # Export VCF file
 3329                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3330
 3331                # Init
 3332                commands = {}
 3333
 3334                for annotation in annotations:
 3335                    annotation_fields = annotations[annotation]
 3336
 3337                    # Annotation Name
 3338                    annotation_name = os.path.basename(annotation)
 3339
 3340                    if not annotation_fields:
 3341                        annotation_fields = {"INFO": None}
 3342
 3343                    log.debug(f"Annotation '{annotation_name}'")
 3344                    log.debug(
 3345                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3346                    )
 3347
 3348                    # Create Database
 3349                    database = Database(
 3350                        database=annotation,
 3351                        databases_folders=databases_folders,
 3352                        assembly=assembly,
 3353                    )
 3354
 3355                    # Find files
 3356                    db_file = database.get_database()
 3357                    db_file = full_path(db_file)
 3358                    db_hdr_file = database.get_header_file()
 3359                    db_hdr_file = full_path(db_hdr_file)
 3360                    db_file_type = database.get_format()
 3361                    db_tbi_file = f"{db_file}.tbi"
 3362                    db_file_compressed = database.is_compressed()
 3363
 3364                    # Check if compressed
 3365                    if not db_file_compressed:
 3366                        log.error(
 3367                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3368                        )
 3369                        raise ValueError(
 3370                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3371                        )
 3372
 3373                    # Check if indexed
 3374                    if not os.path.exists(db_tbi_file):
 3375                        log.error(
 3376                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3377                        )
 3378                        raise ValueError(
 3379                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3380                        )
 3381
 3382                    # Check index - try to create if not exists
 3383                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3384                        log.error("Annotation failed: database not valid")
 3385                        log.error(f"Annotation annotation file: {db_file}")
 3386                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3387                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3388                        raise ValueError(
 3389                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3390                        )
 3391                    else:
 3392
 3393                        log.debug(
 3394                            f"Annotation '{annotation}' - file: "
 3395                            + str(db_file)
 3396                            + " and "
 3397                            + str(db_hdr_file)
 3398                        )
 3399
 3400                        # Load header as VCF object
 3401                        db_hdr_vcf = Variants(input=db_hdr_file)
 3402                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3403                        log.debug(
 3404                            "Annotation database header: "
 3405                            + str(db_hdr_vcf_header_infos)
 3406                        )
 3407
 3408                        # For all fields in database
 3409                        annotation_fields_full = False
 3410                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3411                            annotation_fields = {
 3412                                key: key for key in db_hdr_vcf_header_infos
 3413                            }
 3414                            log.debug(
 3415                                "Annotation database header - All annotations added: "
 3416                                + str(annotation_fields)
 3417                            )
 3418                            annotation_fields_full = True
 3419
 3420                        # # Create file for field rename
 3421                        # log.debug("Create file for field rename")
 3422                        # tmp_rename = NamedTemporaryFile(
 3423                        #     prefix=self.get_prefix(),
 3424                        #     dir=self.get_tmp_dir(),
 3425                        #     suffix=".rename",
 3426                        #     delete=False,
 3427                        # )
 3428                        # tmp_rename_name = tmp_rename.name
 3429                        # tmp_files.append(tmp_rename_name)
 3430
 3431                        # Number of fields
 3432                        nb_annotation_field = 0
 3433                        annotation_list = []
 3434                        annotation_infos_rename_list = []
 3435
 3436                        for annotation_field in annotation_fields:
 3437
 3438                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3439                            annotation_fields_new_name = annotation_fields.get(
 3440                                annotation_field, annotation_field
 3441                            )
 3442                            if not annotation_fields_new_name:
 3443                                annotation_fields_new_name = annotation_field
 3444
 3445                            # Check if field is in DB and if field is not elready in input data
 3446                            if (
 3447                                annotation_field in db_hdr_vcf.get_header().infos
 3448                                and annotation_fields_new_name
 3449                                not in self.get_header().infos
 3450                            ):
 3451
 3452                                log.info(
 3453                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3454                                )
 3455
 3456                                # BCFTools annotate param to rename fields
 3457                                if annotation_field != annotation_fields_new_name:
 3458                                    annotation_infos_rename_list.append(
 3459                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3460                                    )
 3461
 3462                                # Add INFO field to header
 3463                                db_hdr_vcf_header_infos_number = (
 3464                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3465                                )
 3466                                db_hdr_vcf_header_infos_type = (
 3467                                    db_hdr_vcf_header_infos[annotation_field].type
 3468                                    or "String"
 3469                                )
 3470                                db_hdr_vcf_header_infos_description = (
 3471                                    db_hdr_vcf_header_infos[annotation_field].desc
 3472                                    or f"{annotation_field} description"
 3473                                )
 3474                                db_hdr_vcf_header_infos_source = (
 3475                                    db_hdr_vcf_header_infos[annotation_field].source
 3476                                    or "unknown"
 3477                                )
 3478                                db_hdr_vcf_header_infos_version = (
 3479                                    db_hdr_vcf_header_infos[annotation_field].version
 3480                                    or "unknown"
 3481                                )
 3482
 3483                                vcf_reader.infos[annotation_fields_new_name] = (
 3484                                    vcf.parser._Info(
 3485                                        annotation_fields_new_name,
 3486                                        db_hdr_vcf_header_infos_number,
 3487                                        db_hdr_vcf_header_infos_type,
 3488                                        db_hdr_vcf_header_infos_description,
 3489                                        db_hdr_vcf_header_infos_source,
 3490                                        db_hdr_vcf_header_infos_version,
 3491                                        self.code_type_map[
 3492                                            db_hdr_vcf_header_infos_type
 3493                                        ],
 3494                                    )
 3495                                )
 3496
 3497                                annotation_list.append(annotation_field)
 3498
 3499                                nb_annotation_field += 1
 3500
 3501                            else:
 3502
 3503                                if (
 3504                                    annotation_field
 3505                                    not in db_hdr_vcf.get_header().infos
 3506                                ):
 3507                                    log.warning(
 3508                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3509                                    )
 3510                                if (
 3511                                    annotation_fields_new_name
 3512                                    in self.get_header().infos
 3513                                ):
 3514                                    log.warning(
 3515                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3516                                    )
 3517
 3518                        log.info(
 3519                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3520                        )
 3521
 3522                        annotation_infos = ",".join(annotation_list)
 3523
 3524                        if annotation_infos != "":
 3525
 3526                            # Annotated VCF (and error file)
 3527                            tmp_annotation_vcf_name = os.path.join(
 3528                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3529                            )
 3530                            tmp_annotation_vcf_name_err = (
 3531                                tmp_annotation_vcf_name + ".err"
 3532                            )
 3533
 3534                            # Add fields to annotate
 3535                            if not annotation_fields_full:
 3536                                annotation_infos_option = f"-info {annotation_infos}"
 3537                            else:
 3538                                annotation_infos_option = ""
 3539
 3540                            # Info fields rename
 3541                            if annotation_infos_rename_list:
 3542                                annotation_infos_rename = " -c " + ",".join(
 3543                                    annotation_infos_rename_list
 3544                                )
 3545                            else:
 3546                                annotation_infos_rename = ""
 3547
 3548                            # Annotate command
 3549                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3550
 3551                            # Add command
 3552                            commands[command_annotate] = tmp_annotation_vcf_name
 3553
 3554                if commands:
 3555
 3556                    # Export VCF file
 3557                    self.export_variant_vcf(
 3558                        vcf_file=tmp_vcf_name,
 3559                        remove_info=True,
 3560                        add_samples=False,
 3561                        index=True,
 3562                    )
 3563                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3564
 3565                    # Num command
 3566                    nb_command = 0
 3567
 3568                    # Annotate
 3569                    for command_annotate in commands:
 3570                        nb_command += 1
 3571                        log.info(
 3572                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3573                        )
 3574                        log.debug(f"command_annotate={command_annotate}")
 3575                        run_parallel_commands([command_annotate], threads)
 3576
 3577                        # Debug
 3578                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3579
 3580                        # Update variants
 3581                        log.info(
 3582                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3583                        )
 3584                        self.update_from_vcf(commands[command_annotate])
 3585
 3586    def annotation_bcftools(self, threads: int = None) -> None:
 3587        """
 3588        This function annotate with bcftools
 3589
 3590        :param threads: Number of threads to use
 3591        :return: the value of the variable "return_value".
 3592        """
 3593
 3594        # DEBUG
 3595        log.debug("Start annotation with bcftools databases")
 3596
 3597        # Threads
 3598        if not threads:
 3599            threads = self.get_threads()
 3600        log.debug("Threads: " + str(threads))
 3601
 3602        # Config
 3603        config = self.get_config()
 3604        log.debug("Config: " + str(config))
 3605
 3606        # DEBUG
 3607        delete_tmp = True
 3608        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3609            delete_tmp = False
 3610            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3611
 3612        # Config - BCFTools bin command
 3613        bcftools_bin_command = get_bin_command(
 3614            bin="bcftools",
 3615            tool="bcftools",
 3616            bin_type="bin",
 3617            config=config,
 3618            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3619        )
 3620        if not bcftools_bin_command:
 3621            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3622            log.error(msg_err)
 3623            raise ValueError(msg_err)
 3624
 3625        # Config - BCFTools databases folders
 3626        databases_folders = set(
 3627            self.get_config()
 3628            .get("folders", {})
 3629            .get("databases", {})
 3630            .get("annotations", ["."])
 3631            + self.get_config()
 3632            .get("folders", {})
 3633            .get("databases", {})
 3634            .get("bcftools", ["."])
 3635        )
 3636        log.debug("Databases annotations: " + str(databases_folders))
 3637
 3638        # Param
 3639        annotations = (
 3640            self.get_param()
 3641            .get("annotation", {})
 3642            .get("bcftools", {})
 3643            .get("annotations", None)
 3644        )
 3645        log.debug("Annotations: " + str(annotations))
 3646
 3647        # Assembly
 3648        assembly = self.get_param().get(
 3649            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3650        )
 3651
 3652        # Data
 3653        table_variants = self.get_table_variants()
 3654
 3655        # Check if not empty
 3656        log.debug("Check if not empty")
 3657        sql_query_chromosomes = (
 3658            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3659        )
 3660        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3661        if not sql_query_chromosomes_df["count"][0]:
 3662            log.info(f"VCF empty")
 3663            return
 3664
 3665        # Export in VCF
 3666        log.debug("Create initial file to annotate")
 3667        tmp_vcf = NamedTemporaryFile(
 3668            prefix=self.get_prefix(),
 3669            dir=self.get_tmp_dir(),
 3670            suffix=".vcf.gz",
 3671            delete=False,
 3672        )
 3673        tmp_vcf_name = tmp_vcf.name
 3674
 3675        # VCF header
 3676        vcf_reader = self.get_header()
 3677        log.debug("Initial header: " + str(vcf_reader.infos))
 3678
 3679        # Existing annotations
 3680        for vcf_annotation in self.get_header().infos:
 3681
 3682            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3683            log.debug(
 3684                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3685            )
 3686
 3687        if annotations:
 3688
 3689            tmp_ann_vcf_list = []
 3690            commands = []
 3691            tmp_files = []
 3692            err_files = []
 3693
 3694            for annotation in annotations:
 3695                annotation_fields = annotations[annotation]
 3696
 3697                # Annotation Name
 3698                annotation_name = os.path.basename(annotation)
 3699
 3700                if not annotation_fields:
 3701                    annotation_fields = {"INFO": None}
 3702
 3703                log.debug(f"Annotation '{annotation_name}'")
 3704                log.debug(
 3705                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3706                )
 3707
 3708                # Create Database
 3709                database = Database(
 3710                    database=annotation,
 3711                    databases_folders=databases_folders,
 3712                    assembly=assembly,
 3713                )
 3714
 3715                # Find files
 3716                db_file = database.get_database()
 3717                db_file = full_path(db_file)
 3718                db_hdr_file = database.get_header_file()
 3719                db_hdr_file = full_path(db_hdr_file)
 3720                db_file_type = database.get_format()
 3721                db_tbi_file = f"{db_file}.tbi"
 3722                db_file_compressed = database.is_compressed()
 3723
 3724                # Check if compressed
 3725                if not db_file_compressed:
 3726                    log.error(
 3727                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3728                    )
 3729                    raise ValueError(
 3730                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3731                    )
 3732
 3733                # Check if indexed
 3734                if not os.path.exists(db_tbi_file):
 3735                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3736                    raise ValueError(
 3737                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3738                    )
 3739
 3740                # Check index - try to create if not exists
 3741                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3742                    log.error("Annotation failed: database not valid")
 3743                    log.error(f"Annotation annotation file: {db_file}")
 3744                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3745                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3746                    raise ValueError(
 3747                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3748                    )
 3749                else:
 3750
 3751                    log.debug(
 3752                        f"Annotation '{annotation}' - file: "
 3753                        + str(db_file)
 3754                        + " and "
 3755                        + str(db_hdr_file)
 3756                    )
 3757
 3758                    # Load header as VCF object
 3759                    db_hdr_vcf = Variants(input=db_hdr_file)
 3760                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3761                    log.debug(
 3762                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3763                    )
 3764
 3765                    # For all fields in database
 3766                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3767                        annotation_fields = {
 3768                            key: key for key in db_hdr_vcf_header_infos
 3769                        }
 3770                        log.debug(
 3771                            "Annotation database header - All annotations added: "
 3772                            + str(annotation_fields)
 3773                        )
 3774
 3775                    # Number of fields
 3776                    nb_annotation_field = 0
 3777                    annotation_list = []
 3778
 3779                    for annotation_field in annotation_fields:
 3780
 3781                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3782                        annotation_fields_new_name = annotation_fields.get(
 3783                            annotation_field, annotation_field
 3784                        )
 3785                        if not annotation_fields_new_name:
 3786                            annotation_fields_new_name = annotation_field
 3787
 3788                        # Check if field is in DB and if field is not elready in input data
 3789                        if (
 3790                            annotation_field in db_hdr_vcf.get_header().infos
 3791                            and annotation_fields_new_name
 3792                            not in self.get_header().infos
 3793                        ):
 3794
 3795                            log.info(
 3796                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3797                            )
 3798
 3799                            # Add INFO field to header
 3800                            db_hdr_vcf_header_infos_number = (
 3801                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3802                            )
 3803                            db_hdr_vcf_header_infos_type = (
 3804                                db_hdr_vcf_header_infos[annotation_field].type
 3805                                or "String"
 3806                            )
 3807                            db_hdr_vcf_header_infos_description = (
 3808                                db_hdr_vcf_header_infos[annotation_field].desc
 3809                                or f"{annotation_field} description"
 3810                            )
 3811                            db_hdr_vcf_header_infos_source = (
 3812                                db_hdr_vcf_header_infos[annotation_field].source
 3813                                or "unknown"
 3814                            )
 3815                            db_hdr_vcf_header_infos_version = (
 3816                                db_hdr_vcf_header_infos[annotation_field].version
 3817                                or "unknown"
 3818                            )
 3819
 3820                            vcf_reader.infos[annotation_fields_new_name] = (
 3821                                vcf.parser._Info(
 3822                                    annotation_fields_new_name,
 3823                                    db_hdr_vcf_header_infos_number,
 3824                                    db_hdr_vcf_header_infos_type,
 3825                                    db_hdr_vcf_header_infos_description,
 3826                                    db_hdr_vcf_header_infos_source,
 3827                                    db_hdr_vcf_header_infos_version,
 3828                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3829                                )
 3830                            )
 3831
 3832                            # annotation_list.append(annotation_field)
 3833                            if annotation_field != annotation_fields_new_name:
 3834                                annotation_list.append(
 3835                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3836                                )
 3837                            else:
 3838                                annotation_list.append(annotation_field)
 3839
 3840                            nb_annotation_field += 1
 3841
 3842                        else:
 3843
 3844                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3845                                log.warning(
 3846                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3847                                )
 3848                            if annotation_fields_new_name in self.get_header().infos:
 3849                                log.warning(
 3850                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3851                                )
 3852
 3853                    log.info(
 3854                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3855                    )
 3856
 3857                    annotation_infos = ",".join(annotation_list)
 3858
 3859                    if annotation_infos != "":
 3860
 3861                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3862                        log.debug("Protect Header file - remove #CHROM line if exists")
 3863                        tmp_header_vcf = NamedTemporaryFile(
 3864                            prefix=self.get_prefix(),
 3865                            dir=self.get_tmp_dir(),
 3866                            suffix=".hdr",
 3867                            delete=False,
 3868                        )
 3869                        tmp_header_vcf_name = tmp_header_vcf.name
 3870                        tmp_files.append(tmp_header_vcf_name)
 3871                        # Command
 3872                        if db_hdr_file.endswith(".gz"):
 3873                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3874                        else:
 3875                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3876                        # Run
 3877                        run_parallel_commands([command_extract_header], 1)
 3878
 3879                        # Find chomosomes
 3880                        log.debug("Find chromosomes ")
 3881                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3882                        sql_query_chromosomes_df = self.get_query_to_df(
 3883                            sql_query_chromosomes
 3884                        )
 3885                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3886
 3887                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3888
 3889                        # BED columns in the annotation file
 3890                        if db_file_type in ["bed"]:
 3891                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3892
 3893                        for chrom in chomosomes_list:
 3894
 3895                            # Create BED on initial VCF
 3896                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3897                            tmp_bed = NamedTemporaryFile(
 3898                                prefix=self.get_prefix(),
 3899                                dir=self.get_tmp_dir(),
 3900                                suffix=".bed",
 3901                                delete=False,
 3902                            )
 3903                            tmp_bed_name = tmp_bed.name
 3904                            tmp_files.append(tmp_bed_name)
 3905
 3906                            # Detecte regions
 3907                            log.debug(
 3908                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3909                            )
 3910                            window = 1000000
 3911                            sql_query_intervals_for_bed = f"""
 3912                                SELECT  \"#CHROM\",
 3913                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3914                                        \"POS\"+{window}
 3915                                FROM {table_variants} as table_variants
 3916                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3917                            """
 3918                            regions = self.conn.execute(
 3919                                sql_query_intervals_for_bed
 3920                            ).fetchall()
 3921                            merged_regions = merge_regions(regions)
 3922                            log.debug(
 3923                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3924                            )
 3925
 3926                            header = ["#CHROM", "START", "END"]
 3927                            with open(tmp_bed_name, "w") as f:
 3928                                # Write the header with tab delimiter
 3929                                f.write("\t".join(header) + "\n")
 3930                                for d in merged_regions:
 3931                                    # Write each data row with tab delimiter
 3932                                    f.write("\t".join(map(str, d)) + "\n")
 3933
 3934                            # Tmp files
 3935                            tmp_annotation_vcf = NamedTemporaryFile(
 3936                                prefix=self.get_prefix(),
 3937                                dir=self.get_tmp_dir(),
 3938                                suffix=".vcf.gz",
 3939                                delete=False,
 3940                            )
 3941                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3942                            tmp_files.append(tmp_annotation_vcf_name)
 3943                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3944                            tmp_annotation_vcf_name_err = (
 3945                                tmp_annotation_vcf_name + ".err"
 3946                            )
 3947                            err_files.append(tmp_annotation_vcf_name_err)
 3948
 3949                            # Annotate Command
 3950                            log.debug(
 3951                                f"Annotation '{annotation}' - add bcftools command"
 3952                            )
 3953
 3954                            # Command
 3955                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3956
 3957                            # Add command
 3958                            commands.append(command_annotate)
 3959
 3960            # if some commands
 3961            if commands:
 3962
 3963                # Export VCF file
 3964                self.export_variant_vcf(
 3965                    vcf_file=tmp_vcf_name,
 3966                    remove_info=True,
 3967                    add_samples=False,
 3968                    index=True,
 3969                )
 3970
 3971                # Threads
 3972                # calculate threads for annotated commands
 3973                if commands:
 3974                    threads_bcftools_annotate = round(threads / len(commands))
 3975                else:
 3976                    threads_bcftools_annotate = 1
 3977
 3978                if not threads_bcftools_annotate:
 3979                    threads_bcftools_annotate = 1
 3980
 3981                # Add threads option to bcftools commands
 3982                if threads_bcftools_annotate > 1:
 3983                    commands_threaded = []
 3984                    for command in commands:
 3985                        commands_threaded.append(
 3986                            command.replace(
 3987                                f"{bcftools_bin_command} annotate ",
 3988                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3989                            )
 3990                        )
 3991                    commands = commands_threaded
 3992
 3993                # Command annotation multithreading
 3994                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3995                log.info(
 3996                    f"Annotation - Annotation multithreaded in "
 3997                    + str(len(commands))
 3998                    + " commands"
 3999                )
 4000
 4001                run_parallel_commands(commands, threads)
 4002
 4003                # Merge
 4004                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4005
 4006                if tmp_ann_vcf_list_cmd:
 4007
 4008                    # Tmp file
 4009                    tmp_annotate_vcf = NamedTemporaryFile(
 4010                        prefix=self.get_prefix(),
 4011                        dir=self.get_tmp_dir(),
 4012                        suffix=".vcf.gz",
 4013                        delete=True,
 4014                    )
 4015                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4016                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4017                    err_files.append(tmp_annotate_vcf_name_err)
 4018
 4019                    # Tmp file remove command
 4020                    tmp_files_remove_command = ""
 4021                    if tmp_files:
 4022                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4023
 4024                    # Command merge
 4025                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4026                    log.info(
 4027                        f"Annotation - Annotation merging "
 4028                        + str(len(commands))
 4029                        + " annotated files"
 4030                    )
 4031                    log.debug(f"Annotation - merge command: {merge_command}")
 4032                    run_parallel_commands([merge_command], 1)
 4033
 4034                    # Error messages
 4035                    log.info(f"Error/Warning messages:")
 4036                    error_message_command_all = []
 4037                    error_message_command_warning = []
 4038                    error_message_command_err = []
 4039                    for err_file in err_files:
 4040                        with open(err_file, "r") as f:
 4041                            for line in f:
 4042                                message = line.strip()
 4043                                error_message_command_all.append(message)
 4044                                if line.startswith("[W::"):
 4045                                    error_message_command_warning.append(message)
 4046                                if line.startswith("[E::"):
 4047                                    error_message_command_err.append(
 4048                                        f"{err_file}: " + message
 4049                                    )
 4050                    # log info
 4051                    for message in list(
 4052                        set(error_message_command_err + error_message_command_warning)
 4053                    ):
 4054                        log.info(f"   {message}")
 4055                    # debug info
 4056                    for message in list(set(error_message_command_all)):
 4057                        log.debug(f"   {message}")
 4058                    # failed
 4059                    if len(error_message_command_err):
 4060                        log.error("Annotation failed: Error in commands")
 4061                        raise ValueError("Annotation failed: Error in commands")
 4062
 4063                    # Update variants
 4064                    log.info(f"Annotation - Updating...")
 4065                    self.update_from_vcf(tmp_annotate_vcf_name)
 4066
 4067    def annotation_exomiser(self, threads: int = None) -> None:
 4068        """
 4069        This function annotate with Exomiser
 4070
 4071        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4072        - "analysis" (dict/file):
 4073            Full analysis dictionnary parameters (see Exomiser docs).
 4074            Either a dict, or a file in JSON or YAML format.
 4075            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4076            Default : None
 4077        - "preset" (string):
 4078            Analysis preset (available in config folder).
 4079            Used if no full "analysis" is provided.
 4080            Default: "exome"
 4081        - "phenopacket" (dict/file):
 4082            Samples and phenotipic features parameters (see Exomiser docs).
 4083            Either a dict, or a file in JSON or YAML format.
 4084            Default: None
 4085        - "subject" (dict):
 4086            Sample parameters (see Exomiser docs).
 4087            Example:
 4088                "subject":
 4089                    {
 4090                        "id": "ISDBM322017",
 4091                        "sex": "FEMALE"
 4092                    }
 4093            Default: None
 4094        - "sample" (string):
 4095            Sample name to construct "subject" section:
 4096                "subject":
 4097                    {
 4098                        "id": "<sample>",
 4099                        "sex": "UNKNOWN_SEX"
 4100                    }
 4101            Default: None
 4102        - "phenotypicFeatures" (dict)
 4103            Phenotypic features to construct "subject" section.
 4104            Example:
 4105                "phenotypicFeatures":
 4106                    [
 4107                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4108                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4109                    ]
 4110        - "hpo" (list)
 4111            List of HPO ids as phenotypic features.
 4112            Example:
 4113                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4114            Default: []
 4115        - "outputOptions" (dict):
 4116            Output options (see Exomiser docs).
 4117            Default:
 4118                "output_options" =
 4119                    {
 4120                        "outputContributingVariantsOnly": False,
 4121                        "numGenes": 0,
 4122                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4123                    }
 4124        - "transcript_source" (string):
 4125            Transcript source (either "refseq", "ucsc", "ensembl")
 4126            Default: "refseq"
 4127        - "exomiser_to_info" (boolean):
 4128            Add exomiser TSV file columns as INFO fields in VCF.
 4129            Default: False
 4130        - "release" (string):
 4131            Exomise database release.
 4132            If not exists, database release will be downloaded (take a while).
 4133            Default: None (provided by application.properties configuration file)
 4134        - "exomiser_application_properties" (file):
 4135            Exomiser configuration file (see Exomiser docs).
 4136            Useful to automatically download databases (especially for specific genome databases).
 4137
 4138        Notes:
 4139        - If no sample in parameters, first sample in VCF will be chosen
 4140        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4141
 4142        :param threads: The number of threads to use
 4143        :return: None.
 4144        """
 4145
 4146        # DEBUG
 4147        log.debug("Start annotation with Exomiser databases")
 4148
 4149        # Threads
 4150        if not threads:
 4151            threads = self.get_threads()
 4152        log.debug("Threads: " + str(threads))
 4153
 4154        # Config
 4155        config = self.get_config()
 4156        log.debug("Config: " + str(config))
 4157
 4158        # Config - Folders - Databases
 4159        databases_folders = (
 4160            config.get("folders", {})
 4161            .get("databases", {})
 4162            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4163        )
 4164        databases_folders = full_path(databases_folders)
 4165        if not os.path.exists(databases_folders):
 4166            log.error(f"Databases annotations: {databases_folders} NOT found")
 4167        log.debug("Databases annotations: " + str(databases_folders))
 4168
 4169        # Config - Exomiser
 4170        exomiser_bin_command = get_bin_command(
 4171            bin="exomiser-cli*.jar",
 4172            tool="exomiser",
 4173            bin_type="jar",
 4174            config=config,
 4175            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4176        )
 4177        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4178        if not exomiser_bin_command:
 4179            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4180            log.error(msg_err)
 4181            raise ValueError(msg_err)
 4182
 4183        # Param
 4184        param = self.get_param()
 4185        log.debug("Param: " + str(param))
 4186
 4187        # Param - Exomiser
 4188        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4189        log.debug(f"Param Exomiser: {param_exomiser}")
 4190
 4191        # Param - Assembly
 4192        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4193        log.debug("Assembly: " + str(assembly))
 4194
 4195        # Data
 4196        table_variants = self.get_table_variants()
 4197
 4198        # Check if not empty
 4199        log.debug("Check if not empty")
 4200        sql_query_chromosomes = (
 4201            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4202        )
 4203        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4204            log.info(f"VCF empty")
 4205            return False
 4206
 4207        # VCF header
 4208        vcf_reader = self.get_header()
 4209        log.debug("Initial header: " + str(vcf_reader.infos))
 4210
 4211        # Samples
 4212        samples = self.get_header_sample_list()
 4213        if not samples:
 4214            log.error("No Samples in VCF")
 4215            return False
 4216        log.debug(f"Samples: {samples}")
 4217
 4218        # Memory limit
 4219        memory_limit = self.get_memory("8G")
 4220        log.debug(f"memory_limit: {memory_limit}")
 4221
 4222        # Exomiser java options
 4223        exomiser_java_options = (
 4224            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4225        )
 4226        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4227
 4228        # Download Exomiser (if not exists)
 4229        exomiser_release = param_exomiser.get("release", None)
 4230        exomiser_application_properties = param_exomiser.get(
 4231            "exomiser_application_properties", None
 4232        )
 4233        databases_download_exomiser(
 4234            assemblies=[assembly],
 4235            exomiser_folder=databases_folders,
 4236            exomiser_release=exomiser_release,
 4237            exomiser_phenotype_release=exomiser_release,
 4238            exomiser_application_properties=exomiser_application_properties,
 4239        )
 4240
 4241        # Force annotation
 4242        force_update_annotation = True
 4243
 4244        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4245            log.debug("Start annotation Exomiser")
 4246
 4247            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4248
 4249                # tmp_dir = "/tmp/exomiser"
 4250
 4251                ### ANALYSIS ###
 4252                ################
 4253
 4254                # Create analysis.json through analysis dict
 4255                # either analysis in param or by default
 4256                # depending on preset exome/genome)
 4257
 4258                # Init analysis dict
 4259                param_exomiser_analysis_dict = {}
 4260
 4261                # analysis from param
 4262                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4263                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4264
 4265                # If analysis in param -> load anlaysis json
 4266                if param_exomiser_analysis:
 4267
 4268                    # If param analysis is a file and exists
 4269                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4270                        param_exomiser_analysis
 4271                    ):
 4272                        # Load analysis file into analysis dict (either yaml or json)
 4273                        with open(param_exomiser_analysis) as json_file:
 4274                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4275
 4276                    # If param analysis is a dict
 4277                    elif isinstance(param_exomiser_analysis, dict):
 4278                        # Load analysis dict into analysis dict (either yaml or json)
 4279                        param_exomiser_analysis_dict = param_exomiser_analysis
 4280
 4281                    # Error analysis type
 4282                    else:
 4283                        log.error(f"Analysis type unknown. Check param file.")
 4284                        raise ValueError(f"Analysis type unknown. Check param file.")
 4285
 4286                # Case no input analysis config file/dict
 4287                # Use preset (exome/genome) to open default config file
 4288                if not param_exomiser_analysis_dict:
 4289
 4290                    # default preset
 4291                    default_preset = "exome"
 4292
 4293                    # Get param preset or default preset
 4294                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4295
 4296                    # Try to find if preset is a file
 4297                    if os.path.exists(param_exomiser_preset):
 4298                        # Preset file is provided in full path
 4299                        param_exomiser_analysis_default_config_file = (
 4300                            param_exomiser_preset
 4301                        )
 4302                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4303                    #     # Preset file is provided in full path
 4304                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4305                    elif os.path.exists(
 4306                        os.path.join(folder_config, param_exomiser_preset)
 4307                    ):
 4308                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4309                        param_exomiser_analysis_default_config_file = os.path.join(
 4310                            folder_config, param_exomiser_preset
 4311                        )
 4312                    else:
 4313                        # Construct preset file
 4314                        param_exomiser_analysis_default_config_file = os.path.join(
 4315                            folder_config,
 4316                            f"preset-{param_exomiser_preset}-analysis.json",
 4317                        )
 4318
 4319                    # If preset file exists
 4320                    param_exomiser_analysis_default_config_file = full_path(
 4321                        param_exomiser_analysis_default_config_file
 4322                    )
 4323                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4324                        # Load prest file into analysis dict (either yaml or json)
 4325                        with open(
 4326                            param_exomiser_analysis_default_config_file
 4327                        ) as json_file:
 4328                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4329                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4330                                json_file
 4331                            )
 4332
 4333                    # Error preset file
 4334                    else:
 4335                        log.error(
 4336                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4337                        )
 4338                        raise ValueError(
 4339                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4340                        )
 4341
 4342                # If no analysis dict created
 4343                if not param_exomiser_analysis_dict:
 4344                    log.error(f"No analysis config")
 4345                    raise ValueError(f"No analysis config")
 4346
 4347                # Log
 4348                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4349
 4350                ### PHENOPACKET ###
 4351                ###################
 4352
 4353                # If no PhenoPacket in analysis dict -> check in param
 4354                if "phenopacket" not in param_exomiser_analysis_dict:
 4355
 4356                    # If PhenoPacket in param -> load anlaysis json
 4357                    if param_exomiser.get("phenopacket", None):
 4358
 4359                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4360                        param_exomiser_phenopacket = full_path(
 4361                            param_exomiser_phenopacket
 4362                        )
 4363
 4364                        # If param phenopacket is a file and exists
 4365                        if isinstance(
 4366                            param_exomiser_phenopacket, str
 4367                        ) and os.path.exists(param_exomiser_phenopacket):
 4368                            # Load phenopacket file into analysis dict (either yaml or json)
 4369                            with open(param_exomiser_phenopacket) as json_file:
 4370                                param_exomiser_analysis_dict["phenopacket"] = (
 4371                                    yaml.safe_load(json_file)
 4372                                )
 4373
 4374                        # If param phenopacket is a dict
 4375                        elif isinstance(param_exomiser_phenopacket, dict):
 4376                            # Load phenopacket dict into analysis dict (either yaml or json)
 4377                            param_exomiser_analysis_dict["phenopacket"] = (
 4378                                param_exomiser_phenopacket
 4379                            )
 4380
 4381                        # Error phenopacket type
 4382                        else:
 4383                            log.error(f"Phenopacket type unknown. Check param file.")
 4384                            raise ValueError(
 4385                                f"Phenopacket type unknown. Check param file."
 4386                            )
 4387
 4388                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4389                if "phenopacket" not in param_exomiser_analysis_dict:
 4390
 4391                    # Init PhenoPacket
 4392                    param_exomiser_analysis_dict["phenopacket"] = {
 4393                        "id": "analysis",
 4394                        "proband": {},
 4395                    }
 4396
 4397                    ### Add subject ###
 4398
 4399                    # If subject exists
 4400                    param_exomiser_subject = param_exomiser.get("subject", {})
 4401
 4402                    # If subject not exists -> found sample ID
 4403                    if not param_exomiser_subject:
 4404
 4405                        # Found sample ID in param
 4406                        sample = param_exomiser.get("sample", None)
 4407
 4408                        # Find sample ID (first sample)
 4409                        if not sample:
 4410                            sample_list = self.get_header_sample_list()
 4411                            if len(sample_list) > 0:
 4412                                sample = sample_list[0]
 4413                            else:
 4414                                log.error(f"No sample found")
 4415                                raise ValueError(f"No sample found")
 4416
 4417                        # Create subject
 4418                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4419
 4420                    # Add to dict
 4421                    param_exomiser_analysis_dict["phenopacket"][
 4422                        "subject"
 4423                    ] = param_exomiser_subject
 4424
 4425                    ### Add "phenotypicFeatures" ###
 4426
 4427                    # If phenotypicFeatures exists
 4428                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4429                        "phenotypicFeatures", []
 4430                    )
 4431
 4432                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4433                    if not param_exomiser_phenotypicfeatures:
 4434
 4435                        # Found HPO in param
 4436                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4437
 4438                        # Split HPO if list in string format separated by comma
 4439                        if isinstance(param_exomiser_hpo, str):
 4440                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4441
 4442                        # Create HPO list
 4443                        for hpo in param_exomiser_hpo:
 4444                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4445                            param_exomiser_phenotypicfeatures.append(
 4446                                {
 4447                                    "type": {
 4448                                        "id": f"HP:{hpo_clean}",
 4449                                        "label": f"HP:{hpo_clean}",
 4450                                    }
 4451                                }
 4452                            )
 4453
 4454                    # Add to dict
 4455                    param_exomiser_analysis_dict["phenopacket"][
 4456                        "phenotypicFeatures"
 4457                    ] = param_exomiser_phenotypicfeatures
 4458
 4459                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4460                    if not param_exomiser_phenotypicfeatures:
 4461                        for step in param_exomiser_analysis_dict.get(
 4462                            "analysis", {}
 4463                        ).get("steps", []):
 4464                            if "hiPhivePrioritiser" in step:
 4465                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4466                                    "steps", []
 4467                                ).remove(step)
 4468
 4469                ### Add Input File ###
 4470
 4471                # Initial file name and htsFiles
 4472                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4473                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4474                    {
 4475                        "uri": tmp_vcf_name,
 4476                        "htsFormat": "VCF",
 4477                        "genomeAssembly": assembly,
 4478                    }
 4479                ]
 4480
 4481                ### Add metaData ###
 4482
 4483                # If metaData not in analysis dict
 4484                if "metaData" not in param_exomiser_analysis_dict:
 4485                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4486                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4487                        "createdBy": "howard",
 4488                        "phenopacketSchemaVersion": 1,
 4489                    }
 4490
 4491                ### OutputOptions ###
 4492
 4493                # Init output result folder
 4494                output_results = os.path.join(tmp_dir, "results")
 4495
 4496                # If no outputOptions in analysis dict
 4497                if "outputOptions" not in param_exomiser_analysis_dict:
 4498
 4499                    # default output formats
 4500                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4501
 4502                    # Get outputOptions in param
 4503                    output_options = param_exomiser.get("outputOptions", None)
 4504
 4505                    # If no output_options in param -> check
 4506                    if not output_options:
 4507                        output_options = {
 4508                            "outputContributingVariantsOnly": False,
 4509                            "numGenes": 0,
 4510                            "outputFormats": defaut_output_formats,
 4511                        }
 4512
 4513                    # Replace outputDirectory in output options
 4514                    output_options["outputDirectory"] = output_results
 4515                    output_options["outputFileName"] = "howard"
 4516
 4517                    # Add outputOptions in analysis dict
 4518                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4519
 4520                else:
 4521
 4522                    # Replace output_results and output format (if exists in param)
 4523                    param_exomiser_analysis_dict["outputOptions"][
 4524                        "outputDirectory"
 4525                    ] = output_results
 4526                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4527                        list(
 4528                            set(
 4529                                param_exomiser_analysis_dict.get(
 4530                                    "outputOptions", {}
 4531                                ).get("outputFormats", [])
 4532                                + ["TSV_VARIANT", "VCF"]
 4533                            )
 4534                        )
 4535                    )
 4536
 4537                # log
 4538                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4539
 4540                ### ANALYSIS FILE ###
 4541                #####################
 4542
 4543                ### Full JSON analysis config file ###
 4544
 4545                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4546                with open(exomiser_analysis, "w") as fp:
 4547                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4548
 4549                ### SPLIT analysis and sample config files
 4550
 4551                # Splitted analysis dict
 4552                param_exomiser_analysis_dict_for_split = (
 4553                    param_exomiser_analysis_dict.copy()
 4554                )
 4555
 4556                # Phenopacket JSON file
 4557                exomiser_analysis_phenopacket = os.path.join(
 4558                    tmp_dir, "analysis_phenopacket.json"
 4559                )
 4560                with open(exomiser_analysis_phenopacket, "w") as fp:
 4561                    json.dump(
 4562                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4563                        fp,
 4564                        indent=4,
 4565                    )
 4566
 4567                # Analysis JSON file without Phenopacket parameters
 4568                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4569                exomiser_analysis_analysis = os.path.join(
 4570                    tmp_dir, "analysis_analysis.json"
 4571                )
 4572                with open(exomiser_analysis_analysis, "w") as fp:
 4573                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4574
 4575                ### INITAL VCF file ###
 4576                #######################
 4577
 4578                ### Create list of samples to use and include inti initial VCF file ####
 4579
 4580                # Subject (main sample)
 4581                # Get sample ID in analysis dict
 4582                sample_subject = (
 4583                    param_exomiser_analysis_dict.get("phenopacket", {})
 4584                    .get("subject", {})
 4585                    .get("id", None)
 4586                )
 4587                sample_proband = (
 4588                    param_exomiser_analysis_dict.get("phenopacket", {})
 4589                    .get("proband", {})
 4590                    .get("subject", {})
 4591                    .get("id", None)
 4592                )
 4593                sample = []
 4594                if sample_subject:
 4595                    sample.append(sample_subject)
 4596                if sample_proband:
 4597                    sample.append(sample_proband)
 4598
 4599                # Get sample ID within Pedigree
 4600                pedigree_persons_list = (
 4601                    param_exomiser_analysis_dict.get("phenopacket", {})
 4602                    .get("pedigree", {})
 4603                    .get("persons", {})
 4604                )
 4605
 4606                # Create list with all sample ID in pedigree (if exists)
 4607                pedigree_persons = []
 4608                for person in pedigree_persons_list:
 4609                    pedigree_persons.append(person.get("individualId"))
 4610
 4611                # Concat subject sample ID and samples ID in pedigreesamples
 4612                samples = list(set(sample + pedigree_persons))
 4613
 4614                # Check if sample list is not empty
 4615                if not samples:
 4616                    log.error(f"No samples found")
 4617                    raise ValueError(f"No samples found")
 4618
 4619                # Create VCF with sample (either sample in param or first one by default)
 4620                # Export VCF file
 4621                self.export_variant_vcf(
 4622                    vcf_file=tmp_vcf_name,
 4623                    remove_info=True,
 4624                    add_samples=True,
 4625                    list_samples=samples,
 4626                    index=False,
 4627                )
 4628
 4629                ### Execute Exomiser ###
 4630                ########################
 4631
 4632                # Init command
 4633                exomiser_command = ""
 4634
 4635                # Command exomiser options
 4636                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4637
 4638                # Release
 4639                exomiser_release = param_exomiser.get("release", None)
 4640                if exomiser_release:
 4641                    # phenotype data version
 4642                    exomiser_options += (
 4643                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4644                    )
 4645                    # data version
 4646                    exomiser_options += (
 4647                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4648                    )
 4649                    # variant white list
 4650                    variant_white_list_file = (
 4651                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4652                    )
 4653                    if os.path.exists(
 4654                        os.path.join(
 4655                            databases_folders, assembly, variant_white_list_file
 4656                        )
 4657                    ):
 4658                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4659
 4660                # transcript_source
 4661                transcript_source = param_exomiser.get(
 4662                    "transcript_source", None
 4663                )  # ucsc, refseq, ensembl
 4664                if transcript_source:
 4665                    exomiser_options += (
 4666                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4667                    )
 4668
 4669                # If analysis contain proband param
 4670                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4671                    "proband", {}
 4672                ):
 4673                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4674
 4675                # If no proband (usually uniq sample)
 4676                else:
 4677                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4678
 4679                # Log
 4680                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4681
 4682                # Run command
 4683                result = subprocess.call(
 4684                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4685                )
 4686                if result:
 4687                    log.error("Exomiser command failed")
 4688                    raise ValueError("Exomiser command failed")
 4689
 4690                ### RESULTS ###
 4691                ###############
 4692
 4693                ### Annotate with TSV fields ###
 4694
 4695                # Init result tsv file
 4696                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4697
 4698                # Init result tsv file
 4699                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4700
 4701                # Parse TSV file and explode columns in INFO field
 4702                if exomiser_to_info and os.path.exists(output_results_tsv):
 4703
 4704                    # Log
 4705                    log.debug("Exomiser columns to VCF INFO field")
 4706
 4707                    # Retrieve columns and types
 4708                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4709                    output_results_tsv_df = self.get_query_to_df(query)
 4710                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4711
 4712                    # Init concat fields for update
 4713                    sql_query_update_concat_fields = []
 4714
 4715                    # Fields to avoid
 4716                    fields_to_avoid = [
 4717                        "CONTIG",
 4718                        "START",
 4719                        "END",
 4720                        "REF",
 4721                        "ALT",
 4722                        "QUAL",
 4723                        "FILTER",
 4724                        "GENOTYPE",
 4725                    ]
 4726
 4727                    # List all columns to add into header
 4728                    for header_column in output_results_tsv_columns:
 4729
 4730                        # If header column is enable
 4731                        if header_column not in fields_to_avoid:
 4732
 4733                            # Header info type
 4734                            header_info_type = "String"
 4735                            header_column_df = output_results_tsv_df[header_column]
 4736                            header_column_df_dtype = header_column_df.dtype
 4737                            if header_column_df_dtype == object:
 4738                                if (
 4739                                    pd.to_numeric(header_column_df, errors="coerce")
 4740                                    .notnull()
 4741                                    .all()
 4742                                ):
 4743                                    header_info_type = "Float"
 4744                            else:
 4745                                header_info_type = "Integer"
 4746
 4747                            # Header info
 4748                            characters_to_validate = ["-"]
 4749                            pattern = "[" + "".join(characters_to_validate) + "]"
 4750                            header_info_name = re.sub(
 4751                                pattern,
 4752                                "_",
 4753                                f"Exomiser_{header_column}".replace("#", ""),
 4754                            )
 4755                            header_info_number = "."
 4756                            header_info_description = (
 4757                                f"Exomiser {header_column} annotation"
 4758                            )
 4759                            header_info_source = "Exomiser"
 4760                            header_info_version = "unknown"
 4761                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4762                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4763                                header_info_name,
 4764                                header_info_number,
 4765                                header_info_type,
 4766                                header_info_description,
 4767                                header_info_source,
 4768                                header_info_version,
 4769                                header_info_code,
 4770                            )
 4771
 4772                            # Add field to add for update to concat fields
 4773                            sql_query_update_concat_fields.append(
 4774                                f"""
 4775                                CASE
 4776                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4777                                    THEN concat(
 4778                                        '{header_info_name}=',
 4779                                        table_parquet."{header_column}",
 4780                                        ';'
 4781                                        )
 4782
 4783                                    ELSE ''
 4784                                END
 4785                            """
 4786                            )
 4787
 4788                    # Update query
 4789                    sql_query_update = f"""
 4790                        UPDATE {table_variants} as table_variants
 4791                            SET INFO = concat(
 4792                                            CASE
 4793                                                WHEN INFO NOT IN ('', '.')
 4794                                                THEN INFO
 4795                                                ELSE ''
 4796                                            END,
 4797                                            CASE
 4798                                                WHEN table_variants.INFO NOT IN ('','.')
 4799                                                THEN ';'
 4800                                                ELSE ''
 4801                                            END,
 4802                                            (
 4803                                            SELECT 
 4804                                                concat(
 4805                                                    {",".join(sql_query_update_concat_fields)}
 4806                                                )
 4807                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4808                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4809                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4810                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4811                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4812                                            )
 4813                                        )
 4814                            ;
 4815                        """
 4816
 4817                    # Update
 4818                    self.conn.execute(sql_query_update)
 4819
 4820                ### Annotate with VCF INFO field ###
 4821
 4822                # Init result VCF file
 4823                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4824
 4825                # If VCF exists
 4826                if os.path.exists(output_results_vcf):
 4827
 4828                    # Log
 4829                    log.debug("Exomiser result VCF update variants")
 4830
 4831                    # Find Exomiser INFO field annotation in header
 4832                    with gzip.open(output_results_vcf, "rt") as f:
 4833                        header_list = self.read_vcf_header(f)
 4834                    exomiser_vcf_header = vcf.Reader(
 4835                        io.StringIO("\n".join(header_list))
 4836                    )
 4837
 4838                    # Add annotation INFO field to header
 4839                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4840
 4841                    # Update variants with VCF
 4842                    self.update_from_vcf(output_results_vcf)
 4843
 4844        return True
 4845
 4846    def annotation_snpeff(self, threads: int = None) -> None:
 4847        """
 4848        This function annotate with snpEff
 4849
 4850        :param threads: The number of threads to use
 4851        :return: the value of the variable "return_value".
 4852        """
 4853
 4854        # DEBUG
 4855        log.debug("Start annotation with snpeff databases")
 4856
 4857        # Threads
 4858        if not threads:
 4859            threads = self.get_threads()
 4860        log.debug("Threads: " + str(threads))
 4861
 4862        # DEBUG
 4863        delete_tmp = True
 4864        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4865            delete_tmp = False
 4866            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4867
 4868        # Config
 4869        config = self.get_config()
 4870        log.debug("Config: " + str(config))
 4871
 4872        # Config - Folders - Databases
 4873        databases_folders = (
 4874            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4875        )
 4876        log.debug("Databases annotations: " + str(databases_folders))
 4877
 4878        # # Config - Java
 4879        # java_bin = get_bin(
 4880        #     tool="java",
 4881        #     bin="java",
 4882        #     bin_type="bin",
 4883        #     config=config,
 4884        #     default_folder="/usr/bin",
 4885        # )
 4886        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4887        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4888        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4889
 4890        # # Config - snpEff bin
 4891        # snpeff_jar = get_bin(
 4892        #     tool="snpeff",
 4893        #     bin="snpEff.jar",
 4894        #     bin_type="jar",
 4895        #     config=config,
 4896        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4897        # )
 4898        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4899        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4900        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4901
 4902        # Config - snpEff bin command
 4903        snpeff_bin_command = get_bin_command(
 4904            bin="snpEff.jar",
 4905            tool="snpeff",
 4906            bin_type="jar",
 4907            config=config,
 4908            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4909        )
 4910        if not snpeff_bin_command:
 4911            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4912            log.error(msg_err)
 4913            raise ValueError(msg_err)
 4914
 4915        # Config - snpEff databases
 4916        snpeff_databases = (
 4917            config.get("folders", {})
 4918            .get("databases", {})
 4919            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4920        )
 4921        snpeff_databases = full_path(snpeff_databases)
 4922        if snpeff_databases is not None and snpeff_databases != "":
 4923            log.debug(f"Create snpEff databases folder")
 4924            if not os.path.exists(snpeff_databases):
 4925                os.makedirs(snpeff_databases)
 4926
 4927        # Param
 4928        param = self.get_param()
 4929        log.debug("Param: " + str(param))
 4930
 4931        # Param
 4932        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4933        log.debug("Options: " + str(options))
 4934
 4935        # Param - Assembly
 4936        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4937
 4938        # Param - Options
 4939        snpeff_options = (
 4940            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4941        )
 4942        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4943        snpeff_csvstats = (
 4944            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4945        )
 4946        if snpeff_stats:
 4947            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4948            snpeff_stats = full_path(snpeff_stats)
 4949            snpeff_options += f" -stats {snpeff_stats}"
 4950        if snpeff_csvstats:
 4951            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4952            snpeff_csvstats = full_path(snpeff_csvstats)
 4953            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4954
 4955        # Data
 4956        table_variants = self.get_table_variants()
 4957
 4958        # Check if not empty
 4959        log.debug("Check if not empty")
 4960        sql_query_chromosomes = (
 4961            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4962        )
 4963        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4964        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4965            log.info(f"VCF empty")
 4966            return
 4967
 4968        # Export in VCF
 4969        log.debug("Create initial file to annotate")
 4970        tmp_vcf = NamedTemporaryFile(
 4971            prefix=self.get_prefix(),
 4972            dir=self.get_tmp_dir(),
 4973            suffix=".vcf.gz",
 4974            delete=True,
 4975        )
 4976        tmp_vcf_name = tmp_vcf.name
 4977
 4978        # VCF header
 4979        vcf_reader = self.get_header()
 4980        log.debug("Initial header: " + str(vcf_reader.infos))
 4981
 4982        # Existing annotations
 4983        for vcf_annotation in self.get_header().infos:
 4984
 4985            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4986            log.debug(
 4987                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4988            )
 4989
 4990        # Memory limit
 4991        # if config.get("memory", None):
 4992        #     memory_limit = config.get("memory", "8G")
 4993        # else:
 4994        #     memory_limit = "8G"
 4995        memory_limit = self.get_memory("8G")
 4996        log.debug(f"memory_limit: {memory_limit}")
 4997
 4998        # snpEff java options
 4999        snpeff_java_options = (
 5000            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 5001        )
 5002        log.debug(f"Exomiser java options: {snpeff_java_options}")
 5003
 5004        force_update_annotation = True
 5005
 5006        if "ANN" not in self.get_header().infos or force_update_annotation:
 5007
 5008            # Check snpEff database
 5009            log.debug(f"Check snpEff databases {[assembly]}")
 5010            databases_download_snpeff(
 5011                folder=snpeff_databases, assemblies=[assembly], config=config
 5012            )
 5013
 5014            # Export VCF file
 5015            self.export_variant_vcf(
 5016                vcf_file=tmp_vcf_name,
 5017                remove_info=True,
 5018                add_samples=False,
 5019                index=True,
 5020            )
 5021
 5022            # Tmp file
 5023            err_files = []
 5024            tmp_annotate_vcf = NamedTemporaryFile(
 5025                prefix=self.get_prefix(),
 5026                dir=self.get_tmp_dir(),
 5027                suffix=".vcf",
 5028                delete=False,
 5029            )
 5030            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5031            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5032            err_files.append(tmp_annotate_vcf_name_err)
 5033
 5034            # Command
 5035            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5036            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5037            run_parallel_commands([snpeff_command], 1)
 5038
 5039            # Error messages
 5040            log.info(f"Error/Warning messages:")
 5041            error_message_command_all = []
 5042            error_message_command_warning = []
 5043            error_message_command_err = []
 5044            for err_file in err_files:
 5045                with open(err_file, "r") as f:
 5046                    for line in f:
 5047                        message = line.strip()
 5048                        error_message_command_all.append(message)
 5049                        if line.startswith("[W::"):
 5050                            error_message_command_warning.append(message)
 5051                        if line.startswith("[E::"):
 5052                            error_message_command_err.append(f"{err_file}: " + message)
 5053            # log info
 5054            for message in list(
 5055                set(error_message_command_err + error_message_command_warning)
 5056            ):
 5057                log.info(f"   {message}")
 5058            # debug info
 5059            for message in list(set(error_message_command_all)):
 5060                log.debug(f"   {message}")
 5061            # failed
 5062            if len(error_message_command_err):
 5063                log.error("Annotation failed: Error in commands")
 5064                raise ValueError("Annotation failed: Error in commands")
 5065
 5066            # Find annotation in header
 5067            with open(tmp_annotate_vcf_name, "rt") as f:
 5068                header_list = self.read_vcf_header(f)
 5069            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5070
 5071            for ann in annovar_vcf_header.infos:
 5072                if ann not in self.get_header().infos:
 5073                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5074
 5075            # Update variants
 5076            log.info(f"Annotation - Updating...")
 5077            self.update_from_vcf(tmp_annotate_vcf_name)
 5078
 5079        else:
 5080            if "ANN" in self.get_header().infos:
 5081                log.debug(f"Existing snpEff annotations in VCF")
 5082            if force_update_annotation:
 5083                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5084
 5085    def annotation_annovar(self, threads: int = None) -> None:
 5086        """
 5087        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5088        annotations
 5089
 5090        :param threads: number of threads to use
 5091        :return: the value of the variable "return_value".
 5092        """
 5093
 5094        # DEBUG
 5095        log.debug("Start annotation with Annovar databases")
 5096
 5097        # Threads
 5098        if not threads:
 5099            threads = self.get_threads()
 5100        log.debug("Threads: " + str(threads))
 5101
 5102        # Tmp en Err files
 5103        tmp_files = []
 5104        err_files = []
 5105
 5106        # DEBUG
 5107        delete_tmp = True
 5108        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5109            delete_tmp = False
 5110            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5111
 5112        # Config
 5113        config = self.get_config()
 5114        log.debug("Config: " + str(config))
 5115
 5116        # Config - Folders - Databases
 5117        databases_folders = (
 5118            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5119        )
 5120        log.debug("Databases annotations: " + str(databases_folders))
 5121
 5122        # Config - annovar bin command
 5123        annovar_bin_command = get_bin_command(
 5124            bin="table_annovar.pl",
 5125            tool="annovar",
 5126            bin_type="perl",
 5127            config=config,
 5128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5129        )
 5130        if not annovar_bin_command:
 5131            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5132            log.error(msg_err)
 5133            raise ValueError(msg_err)
 5134
 5135        # Config - BCFTools bin command
 5136        bcftools_bin_command = get_bin_command(
 5137            bin="bcftools",
 5138            tool="bcftools",
 5139            bin_type="bin",
 5140            config=config,
 5141            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5142        )
 5143        if not bcftools_bin_command:
 5144            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5145            log.error(msg_err)
 5146            raise ValueError(msg_err)
 5147
 5148        # Config - annovar databases
 5149        annovar_databases = (
 5150            config.get("folders", {})
 5151            .get("databases", {})
 5152            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5153        )
 5154        annovar_databases = full_path(annovar_databases)
 5155        if annovar_databases != "" and not os.path.exists(annovar_databases):
 5156            os.makedirs(annovar_databases)
 5157
 5158        # Param
 5159        param = self.get_param()
 5160        log.debug("Param: " + str(param))
 5161
 5162        # Param - options
 5163        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5164        log.debug("Options: " + str(options))
 5165
 5166        # Param - annotations
 5167        annotations = (
 5168            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5169        )
 5170        log.debug("Annotations: " + str(annotations))
 5171
 5172        # Param - Assembly
 5173        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5174
 5175        # Annovar database assembly
 5176        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5177        if annovar_databases_assembly != "" and not os.path.exists(
 5178            annovar_databases_assembly
 5179        ):
 5180            os.makedirs(annovar_databases_assembly)
 5181
 5182        # Data
 5183        table_variants = self.get_table_variants()
 5184
 5185        # Check if not empty
 5186        log.debug("Check if not empty")
 5187        sql_query_chromosomes = (
 5188            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5189        )
 5190        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5191        if not sql_query_chromosomes_df["count"][0]:
 5192            log.info(f"VCF empty")
 5193            return
 5194
 5195        # VCF header
 5196        vcf_reader = self.get_header()
 5197        log.debug("Initial header: " + str(vcf_reader.infos))
 5198
 5199        # Existing annotations
 5200        for vcf_annotation in self.get_header().infos:
 5201
 5202            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5203            log.debug(
 5204                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5205            )
 5206
 5207        force_update_annotation = True
 5208
 5209        if annotations:
 5210
 5211            commands = []
 5212            tmp_annotates_vcf_name_list = []
 5213
 5214            # Export in VCF
 5215            log.debug("Create initial file to annotate")
 5216            tmp_vcf = NamedTemporaryFile(
 5217                prefix=self.get_prefix(),
 5218                dir=self.get_tmp_dir(),
 5219                suffix=".vcf.gz",
 5220                delete=False,
 5221            )
 5222            tmp_vcf_name = tmp_vcf.name
 5223            tmp_files.append(tmp_vcf_name)
 5224            tmp_files.append(tmp_vcf_name + ".tbi")
 5225
 5226            # Export VCF file
 5227            self.export_variant_vcf(
 5228                vcf_file=tmp_vcf_name,
 5229                remove_info=".",
 5230                add_samples=False,
 5231                index=True,
 5232            )
 5233
 5234            # Create file for field rename
 5235            log.debug("Create file for field rename")
 5236            tmp_rename = NamedTemporaryFile(
 5237                prefix=self.get_prefix(),
 5238                dir=self.get_tmp_dir(),
 5239                suffix=".rename",
 5240                delete=False,
 5241            )
 5242            tmp_rename_name = tmp_rename.name
 5243            tmp_files.append(tmp_rename_name)
 5244
 5245            # Check Annovar database
 5246            log.debug(
 5247                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5248            )
 5249            databases_download_annovar(
 5250                folder=annovar_databases,
 5251                files=list(annotations.keys()),
 5252                assemblies=[assembly],
 5253            )
 5254
 5255            for annotation in annotations:
 5256                annotation_fields = annotations[annotation]
 5257
 5258                if not annotation_fields:
 5259                    annotation_fields = {"INFO": None}
 5260
 5261                log.info(f"Annotations Annovar - database '{annotation}'")
 5262                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5263
 5264                # Tmp file for annovar
 5265                err_files = []
 5266                tmp_annotate_vcf_directory = TemporaryDirectory(
 5267                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5268                )
 5269                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5270                tmp_annotate_vcf_name_annovar = (
 5271                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5272                )
 5273                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5274                err_files.append(tmp_annotate_vcf_name_err)
 5275                tmp_files.append(tmp_annotate_vcf_name_err)
 5276
 5277                # Tmp file final vcf annotated by annovar
 5278                tmp_annotate_vcf = NamedTemporaryFile(
 5279                    prefix=self.get_prefix(),
 5280                    dir=self.get_tmp_dir(),
 5281                    suffix=".vcf.gz",
 5282                    delete=False,
 5283                )
 5284                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5285                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5286                tmp_files.append(tmp_annotate_vcf_name)
 5287                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5288
 5289                # Number of fields
 5290                annotation_list = []
 5291                annotation_renamed_list = []
 5292
 5293                for annotation_field in annotation_fields:
 5294
 5295                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5296                    annotation_fields_new_name = annotation_fields.get(
 5297                        annotation_field, annotation_field
 5298                    )
 5299                    if not annotation_fields_new_name:
 5300                        annotation_fields_new_name = annotation_field
 5301
 5302                    if (
 5303                        force_update_annotation
 5304                        or annotation_fields_new_name not in self.get_header().infos
 5305                    ):
 5306                        annotation_list.append(annotation_field)
 5307                        annotation_renamed_list.append(annotation_fields_new_name)
 5308                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5309                        log.warning(
 5310                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5311                        )
 5312
 5313                    # Add rename info
 5314                    run_parallel_commands(
 5315                        [
 5316                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5317                        ],
 5318                        1,
 5319                    )
 5320
 5321                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5322                log.debug("annotation_list: " + str(annotation_list))
 5323
 5324                # protocol
 5325                protocol = annotation
 5326
 5327                # argument
 5328                argument = ""
 5329
 5330                # operation
 5331                operation = "f"
 5332                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5333                    "ensGene"
 5334                ):
 5335                    operation = "g"
 5336                    if options.get("genebase", None):
 5337                        argument = f"""'{options.get("genebase","")}'"""
 5338                elif annotation in ["cytoBand"]:
 5339                    operation = "r"
 5340
 5341                # argument option
 5342                argument_option = ""
 5343                if argument != "":
 5344                    argument_option = " --argument " + argument
 5345
 5346                # command options
 5347                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5348                for option in options:
 5349                    if option not in ["genebase"]:
 5350                        command_options += f""" --{option}={options[option]}"""
 5351
 5352                # Command
 5353
 5354                # Command - Annovar
 5355                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5356                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5357
 5358                # Command - start pipe
 5359                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5360
 5361                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5362                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5363
 5364                # Command - Special characters (refGene annotation)
 5365                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5366
 5367                # Command - Clean empty fields (with value ".")
 5368                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5369
 5370                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5371                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5372                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5373                    # for ann in annotation_renamed_list:
 5374                    for ann in annotation_list:
 5375                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5376
 5377                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5378
 5379                # Command - indexing
 5380                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5381
 5382                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5383                run_parallel_commands([command_annovar], 1)
 5384
 5385                # Error messages
 5386                log.info(f"Error/Warning messages:")
 5387                error_message_command_all = []
 5388                error_message_command_warning = []
 5389                error_message_command_err = []
 5390                for err_file in err_files:
 5391                    with open(err_file, "r") as f:
 5392                        for line in f:
 5393                            message = line.strip()
 5394                            error_message_command_all.append(message)
 5395                            if line.startswith("[W::") or line.startswith("WARNING"):
 5396                                error_message_command_warning.append(message)
 5397                            if line.startswith("[E::") or line.startswith("ERROR"):
 5398                                error_message_command_err.append(
 5399                                    f"{err_file}: " + message
 5400                                )
 5401                # log info
 5402                for message in list(
 5403                    set(error_message_command_err + error_message_command_warning)
 5404                ):
 5405                    log.info(f"   {message}")
 5406                # debug info
 5407                for message in list(set(error_message_command_all)):
 5408                    log.debug(f"   {message}")
 5409                # failed
 5410                if len(error_message_command_err):
 5411                    log.error("Annotation failed: Error in commands")
 5412                    raise ValueError("Annotation failed: Error in commands")
 5413
 5414            if tmp_annotates_vcf_name_list:
 5415
 5416                # List of annotated files
 5417                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5418
 5419                # Tmp file
 5420                tmp_annotate_vcf = NamedTemporaryFile(
 5421                    prefix=self.get_prefix(),
 5422                    dir=self.get_tmp_dir(),
 5423                    suffix=".vcf.gz",
 5424                    delete=False,
 5425                )
 5426                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5427                tmp_files.append(tmp_annotate_vcf_name)
 5428                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5429                err_files.append(tmp_annotate_vcf_name_err)
 5430                tmp_files.append(tmp_annotate_vcf_name_err)
 5431
 5432                # Command merge
 5433                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5434                log.info(
 5435                    f"Annotation Annovar - Annotation merging "
 5436                    + str(len(tmp_annotates_vcf_name_list))
 5437                    + " annotated files"
 5438                )
 5439                log.debug(f"Annotation - merge command: {merge_command}")
 5440                run_parallel_commands([merge_command], 1)
 5441
 5442                # Find annotation in header
 5443                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5444                    header_list = self.read_vcf_header(f)
 5445                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5446
 5447                for ann in annovar_vcf_header.infos:
 5448                    if ann not in self.get_header().infos:
 5449                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5450
 5451                # Update variants
 5452                log.info(f"Annotation Annovar - Updating...")
 5453                self.update_from_vcf(tmp_annotate_vcf_name)
 5454
 5455            # Clean files
 5456            # Tmp file remove command
 5457            if True:
 5458                tmp_files_remove_command = ""
 5459                if tmp_files:
 5460                    tmp_files_remove_command = " ".join(tmp_files)
 5461                clean_command = f" rm -f {tmp_files_remove_command} "
 5462                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5463                log.debug(f"Annotation - cleaning command: {clean_command}")
 5464                run_parallel_commands([clean_command], 1)
 5465
 5466    # Parquet
 5467    def annotation_parquet(self, threads: int = None) -> None:
 5468        """
 5469        Annotate the variants table using one or more Parquet database files.
 5470
 5471        :param threads: number of threads to use for the annotation
 5472        :return: None (the variants table is updated in place)
 5473        """
 5474
 5475        # DEBUG
 5476        log.debug("Start annotation with parquet databases")
 5477
 5478        # Threads
 5479        if not threads:
 5480            threads = self.get_threads()
 5481        log.debug("Threads: " + str(threads))
 5482
 5483        # DEBUG
 5484        delete_tmp = True
 5485        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5486            delete_tmp = False
 5487            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5488
 5489        # Config
 5490        databases_folders = set(
 5491            self.get_config()
 5492            .get("folders", {})
 5493            .get("databases", {})
 5494            .get("annotations", ["."])
 5495            + self.get_config()
 5496            .get("folders", {})
 5497            .get("databases", {})
 5498            .get("parquet", ["."])
 5499        )
 5500        log.debug("Databases annotations: " + str(databases_folders))
 5501
 5502        # Param
 5503        annotations = (
 5504            self.get_param()
 5505            .get("annotation", {})
 5506            .get("parquet", {})
 5507            .get("annotations", None)
 5508        )
 5509        log.debug("Annotations: " + str(annotations))
 5510
 5511        # Assembly
 5512        assembly = self.get_param().get(
 5513            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5514        )
 5515
 5516        # Force Update Annotation
 5517        force_update_annotation = (
 5518            self.get_param()
 5519            .get("annotation", {})
 5520            .get("options", {})
 5521            .get("annotations_update", False)
 5522        )
 5523        log.debug(f"force_update_annotation={force_update_annotation}")
 5524        force_append_annotation = (
 5525            self.get_param()
 5526            .get("annotation", {})
 5527            .get("options", {})
 5528            .get("annotations_append", False)
 5529        )
 5530        log.debug(f"force_append_annotation={force_append_annotation}")
 5531
 5532        # Data
 5533        table_variants = self.get_table_variants()
 5534
 5535        # Check if not empty
 5536        log.debug("Check if not empty")
 5537        sql_query_chromosomes_df = self.get_query_to_df(
 5538            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5539        )
 5540        if not sql_query_chromosomes_df["count"][0]:
 5541            log.info(f"VCF empty")
 5542            return
 5543
 5544        # VCF header
 5545        vcf_reader = self.get_header()
 5546        log.debug("Initial header: " + str(vcf_reader.infos))
 5547
 5548        # Nb Variants POS
 5549        log.debug("NB Variants Start")
 5550        nb_variants = self.conn.execute(
 5551            f"SELECT count(*) AS count FROM variants"
 5552        ).fetchdf()["count"][0]
 5553        log.debug("NB Variants Stop")
 5554
 5555        # Existing annotations
 5556        for vcf_annotation in self.get_header().infos:
 5557
 5558            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5559            log.debug(
 5560                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5561            )
 5562
 5563        # Added columns
 5564        added_columns = []
 5565
 5566        # drop indexes
 5567        log.debug(f"Drop indexes...")
 5568        self.drop_indexes()
 5569
 5570        if annotations:
 5571
 5572            if "ALL" in annotations:
 5573
 5574                all_param = annotations.get("ALL", {})
 5575                all_param_formats = all_param.get("formats", None)
 5576                all_param_releases = all_param.get("releases", None)
 5577
 5578                databases_infos_dict = self.scan_databases(
 5579                    database_formats=all_param_formats,
 5580                    database_releases=all_param_releases,
 5581                )
 5582                for database_infos in databases_infos_dict.keys():
 5583                    if database_infos not in annotations:
 5584                        annotations[database_infos] = {"INFO": None}
 5585
 5586            for annotation in annotations:
 5587
 5588                if annotation in ["ALL"]:
 5589                    continue
 5590
 5591                # Annotation Name
 5592                annotation_name = os.path.basename(annotation)
 5593
 5594                # Annotation fields
 5595                annotation_fields = annotations[annotation]
 5596                if not annotation_fields:
 5597                    annotation_fields = {"INFO": None}
 5598
 5599                log.debug(f"Annotation '{annotation_name}'")
 5600                log.debug(
 5601                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5602                )
 5603
 5604                # Create Database
 5605                database = Database(
 5606                    database=annotation,
 5607                    databases_folders=databases_folders,
 5608                    assembly=assembly,
 5609                )
 5610
 5611                # Find files
 5612                parquet_file = database.get_database()
 5613                parquet_hdr_file = database.get_header_file()
 5614                parquet_type = database.get_type()
 5615
 5616                # Check if files exists
 5617                if not parquet_file or not parquet_hdr_file:
 5618                    log.error("Annotation failed: file not found")
 5619                    raise ValueError("Annotation failed: file not found")
 5620                else:
 5621                    # Get parquet connexion
 5622                    parquet_sql_attach = database.get_sql_database_attach(
 5623                        output="query"
 5624                    )
 5625                    if parquet_sql_attach:
 5626                        self.conn.execute(parquet_sql_attach)
 5627                    parquet_file_link = database.get_sql_database_link()
 5628                    # Log
 5629                    log.debug(
 5630                        f"Annotation '{annotation_name}' - file: "
 5631                        + str(parquet_file)
 5632                        + " and "
 5633                        + str(parquet_hdr_file)
 5634                    )
 5635
 5636                    # Database full header columns
 5637                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5638                        parquet_hdr_file
 5639                    )
 5640                    # Log
 5641                    log.debug(
 5642                        "Annotation database header columns : "
 5643                        + str(parquet_hdr_vcf_header_columns)
 5644                    )
 5645
 5646                    # Load header as VCF object
 5647                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5648                    # Log
 5649                    log.debug(
 5650                        "Annotation database header: "
 5651                        + str(parquet_hdr_vcf_header_infos)
 5652                    )
 5653
 5654                    # Get extra infos
 5655                    parquet_columns = database.get_extra_columns()
 5656                    # Log
 5657                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5658
 5659                    # Add extra columns if "ALL" in annotation_fields
 5660                    # if "ALL" in annotation_fields:
 5661                    #     allow_add_extra_column = True
 5662                    if "ALL" in annotation_fields and database.get_extra_columns():
 5663                        for extra_column in database.get_extra_columns():
 5664                            if (
 5665                                extra_column not in annotation_fields
 5666                                and extra_column.replace("INFO/", "")
 5667                                not in parquet_hdr_vcf_header_infos
 5668                            ):
 5669                                parquet_hdr_vcf_header_infos[extra_column] = (
 5670                                    vcf.parser._Info(
 5671                                        extra_column,
 5672                                        ".",
 5673                                        "String",
 5674                                        f"{extra_column} description",
 5675                                        "unknown",
 5676                                        "unknown",
 5677                                        self.code_type_map["String"],
 5678                                    )
 5679                                )
 5680
 5681                    # For all fields in database
 5682                    annotation_fields_all = False
 5683                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5684                        annotation_fields_all = True
 5685                        annotation_fields = {
 5686                            key: key for key in parquet_hdr_vcf_header_infos
 5687                        }
 5688
 5689                        log.debug(
 5690                            "Annotation database header - All annotations added: "
 5691                            + str(annotation_fields)
 5692                        )
 5693
 5694                    # Init
 5695
 5696                    # List of annotation fields to use
 5697                    sql_query_annotation_update_info_sets = []
 5698
 5699                    # List of annotation to agregate
 5700                    sql_query_annotation_to_agregate = []
 5701
 5702                    # Number of fields
 5703                    nb_annotation_field = 0
 5704
 5705                    # Annotation fields processed
 5706                    annotation_fields_processed = []
 5707
 5708                    # Columns mapping
 5709                    map_columns = database.map_columns(
 5710                        columns=annotation_fields, prefixes=["INFO/"]
 5711                    )
 5712
 5713                    # Query dict for fields to remove (update option)
 5714                    query_dict_remove = {}
 5715
 5716                    # Fetch Anotation fields
 5717                    for annotation_field in annotation_fields:
 5718
 5719                        # annotation_field_column
 5720                        annotation_field_column = map_columns.get(
 5721                            annotation_field, "INFO"
 5722                        )
 5723
 5724                        # field new name, if parametered
 5725                        annotation_fields_new_name = annotation_fields.get(
 5726                            annotation_field, annotation_field
 5727                        )
 5728                        if not annotation_fields_new_name:
 5729                            annotation_fields_new_name = annotation_field
 5730
 5731                        # To annotate
 5732                        # force_update_annotation = True
 5733                        # force_append_annotation = True
 5734                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5735                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5736                            force_update_annotation
 5737                            or force_append_annotation
 5738                            or (
 5739                                annotation_fields_new_name
 5740                                not in self.get_header().infos
 5741                            )
 5742                        ):
 5743
 5744                            # Add field to annotation to process list
 5745                            annotation_fields_processed.append(
 5746                                annotation_fields_new_name
 5747                            )
 5748
 5749                            # explode infos for the field
 5750                            annotation_fields_new_name_info_msg = ""
 5751                            if (
 5752                                force_update_annotation
 5753                                and annotation_fields_new_name
 5754                                in self.get_header().infos
 5755                            ):
 5756                                # Remove field from INFO
 5757                                query = f"""
 5758                                    UPDATE {table_variants} as table_variants
 5759                                    SET INFO = REGEXP_REPLACE(
 5760                                                concat(table_variants.INFO,''),
 5761                                                ';*{annotation_fields_new_name}=[^;]*',
 5762                                                ''
 5763                                                )
 5764                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5765                                """
 5766                                annotation_fields_new_name_info_msg = " [update]"
 5767                                query_dict_remove[
 5768                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5769                                ] = query
 5770
 5771                            # Sep between fields in INFO
 5772                            nb_annotation_field += 1
 5773                            if nb_annotation_field > 1:
 5774                                annotation_field_sep = ";"
 5775                            else:
 5776                                annotation_field_sep = ""
 5777
 5778                            log.info(
 5779                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5780                            )
 5781
 5782                            # Add INFO field to header
 5783                            parquet_hdr_vcf_header_infos_number = (
 5784                                parquet_hdr_vcf_header_infos[annotation_field].num
 5785                                or "."
 5786                            )
 5787                            parquet_hdr_vcf_header_infos_type = (
 5788                                parquet_hdr_vcf_header_infos[annotation_field].type
 5789                                or "String"
 5790                            )
 5791                            parquet_hdr_vcf_header_infos_description = (
 5792                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5793                                or f"{annotation_field} description"
 5794                            )
 5795                            parquet_hdr_vcf_header_infos_source = (
 5796                                parquet_hdr_vcf_header_infos[annotation_field].source
 5797                                or "unknown"
 5798                            )
 5799                            parquet_hdr_vcf_header_infos_version = (
 5800                                parquet_hdr_vcf_header_infos[annotation_field].version
 5801                                or "unknown"
 5802                            )
 5803
 5804                            vcf_reader.infos[annotation_fields_new_name] = (
 5805                                vcf.parser._Info(
 5806                                    annotation_fields_new_name,
 5807                                    parquet_hdr_vcf_header_infos_number,
 5808                                    parquet_hdr_vcf_header_infos_type,
 5809                                    parquet_hdr_vcf_header_infos_description,
 5810                                    parquet_hdr_vcf_header_infos_source,
 5811                                    parquet_hdr_vcf_header_infos_version,
 5812                                    self.code_type_map[
 5813                                        parquet_hdr_vcf_header_infos_type
 5814                                    ],
 5815                                )
 5816                            )
 5817
 5818                            # Append
 5819                            if force_append_annotation:
 5820                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5821                            else:
 5822                                query_case_when_append = ""
 5823
 5824                            # Annotation/Update query fields
 5825                            # Found in INFO column
 5826                            if (
 5827                                annotation_field_column == "INFO"
 5828                                and "INFO" in parquet_hdr_vcf_header_columns
 5829                            ):
 5830                                sql_query_annotation_update_info_sets.append(
 5831                                    f"""
 5832                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5833                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5834                                        ELSE ''
 5835                                    END
 5836                                """
 5837                                )
 5838                            # Found in a specific column
 5839                            else:
 5840                                sql_query_annotation_update_info_sets.append(
 5841                                    f"""
 5842                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 5843                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 5844                                        ELSE ''
 5845                                    END
 5846                                """
 5847                                )
 5848                                sql_query_annotation_to_agregate.append(
 5849                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5850                                )
 5851
 5852                        # Not to annotate
 5853                        else:
 5854
 5855                            if force_update_annotation:
 5856                                annotation_message = "forced"
 5857                            else:
 5858                                annotation_message = "skipped"
 5859
 5860                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5861                                log.warning(
 5862                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5863                                )
 5864                            if annotation_fields_new_name in self.get_header().infos:
 5865                                log.warning(
 5866                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5867                                )
 5868
 5869                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5870                    # allow_annotation_full_info = True
 5871                    allow_annotation_full_info = not force_append_annotation
 5872
 5873                    if parquet_type in ["regions"]:
 5874                        allow_annotation_full_info = False
 5875
 5876                    if (
 5877                        allow_annotation_full_info
 5878                        and nb_annotation_field == len(annotation_fields)
 5879                        and annotation_fields_all
 5880                        and (
 5881                            "INFO" in parquet_hdr_vcf_header_columns
 5882                            and "INFO" in database.get_extra_columns()
 5883                        )
 5884                    ):
 5885                        log.debug("Column INFO annotation enabled")
 5886                        sql_query_annotation_update_info_sets = []
 5887                        sql_query_annotation_update_info_sets.append(
 5888                            f" table_parquet.INFO "
 5889                        )
 5890
 5891                    if sql_query_annotation_update_info_sets:
 5892
 5893                        # Annotate
 5894                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5895
 5896                        # Join query annotation update info sets for SQL
 5897                        sql_query_annotation_update_info_sets_sql = ",".join(
 5898                            sql_query_annotation_update_info_sets
 5899                        )
 5900
 5901                        # Check chromosomes list (and variants infos)
 5902                        sql_query_chromosomes = f"""
 5903                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5904                            FROM {table_variants} as table_variants
 5905                            GROUP BY table_variants."#CHROM"
 5906                            ORDER BY table_variants."#CHROM"
 5907                            """
 5908                        sql_query_chromosomes_df = self.conn.execute(
 5909                            sql_query_chromosomes
 5910                        ).df()
 5911                        sql_query_chromosomes_dict = {
 5912                            entry["CHROM"]: {
 5913                                "count": entry["count_variants"],
 5914                                "min": entry["min_variants"],
 5915                                "max": entry["max_variants"],
 5916                            }
 5917                            for index, entry in sql_query_chromosomes_df.iterrows()
 5918                        }
 5919
 5920                        # Init
 5921                        nb_of_query = 0
 5922                        nb_of_variant_annotated = 0
 5923                        query_dict = query_dict_remove
 5924
 5925                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5926                        for chrom in sql_query_chromosomes_dict:
 5927
 5928                            # Number of variant by chromosome
 5929                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5930                                chrom, {}
 5931                            ).get("count", 0)
 5932
 5933                            log.debug(
 5934                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5935                            )
 5936
 5937                            # Annotation with regions database
 5938                            if parquet_type in ["regions"]:
 5939                                sql_query_annotation_from_clause = f"""
 5940                                    FROM (
 5941                                        SELECT 
 5942                                            '{chrom}' AS \"#CHROM\",
 5943                                            table_variants_from.\"POS\" AS \"POS\",
 5944                                            {",".join(sql_query_annotation_to_agregate)}
 5945                                        FROM {table_variants} as table_variants_from
 5946                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5947                                            table_parquet_from."#CHROM" = '{chrom}'
 5948                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5949                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5950                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5951                                                )
 5952                                        )
 5953                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5954                                        GROUP BY table_variants_from.\"POS\"
 5955                                        )
 5956                                        as table_parquet
 5957                                """
 5958
 5959                                sql_query_annotation_where_clause = """
 5960                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5961                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5962                                """
 5963
 5964                            # Annotation with variants database
 5965                            else:
 5966                                sql_query_annotation_from_clause = f"""
 5967                                    FROM {parquet_file_link} as table_parquet
 5968                                """
 5969                                sql_query_annotation_where_clause = f"""
 5970                                    table_variants."#CHROM" = '{chrom}'
 5971                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5972                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5973                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5974                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5975                                """
 5976
 5977                            # Create update query
 5978                            sql_query_annotation_chrom_interval_pos = f"""
 5979                                UPDATE {table_variants} as table_variants
 5980                                    SET INFO = 
 5981                                        concat(
 5982                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5983                                                THEN table_variants.INFO
 5984                                                ELSE ''
 5985                                            END
 5986                                            ,
 5987                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5988                                                        AND (
 5989                                                        concat({sql_query_annotation_update_info_sets_sql})
 5990                                                        )
 5991                                                        NOT IN ('','.') 
 5992                                                    THEN ';'
 5993                                                    ELSE ''
 5994                                            END
 5995                                            ,
 5996                                            {sql_query_annotation_update_info_sets_sql}
 5997                                            )
 5998                                    {sql_query_annotation_from_clause}
 5999                                    WHERE {sql_query_annotation_where_clause}
 6000                                    ;
 6001                                """
 6002
 6003                            # Add update query to dict
 6004                            query_dict[
 6005                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6006                            ] = sql_query_annotation_chrom_interval_pos
 6007
 6008                        nb_of_query = len(query_dict)
 6009                        num_query = 0
 6010
 6011                        # SET max_expression_depth TO x
 6012                        self.conn.execute("SET max_expression_depth TO 10000")
 6013
 6014                        for query_name in query_dict:
 6015                            query = query_dict[query_name]
 6016                            num_query += 1
 6017                            log.info(
 6018                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6019                            )
 6020                            result = self.conn.execute(query)
 6021                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6022                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6023                            log.info(
 6024                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6025                            )
 6026
 6027                        log.info(
 6028                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6029                        )
 6030
 6031                    else:
 6032
 6033                        log.info(
 6034                            f"Annotation '{annotation_name}' - No Annotations available"
 6035                        )
 6036
 6037                    log.debug("Final header: " + str(vcf_reader.infos))
 6038
 6039        # Remove added columns
 6040        for added_column in added_columns:
 6041            self.drop_column(column=added_column)
 6042
 6043    def annotation_splice(self, threads: int = None) -> None:
 6044        """
 6045        This function annotate with snpEff
 6046
 6047        :param threads: The number of threads to use
 6048        :return: the value of the variable "return_value".
 6049        """
 6050
 6051        # DEBUG
 6052        log.debug("Start annotation with splice tools")
 6053
 6054        # Threads
 6055        if not threads:
 6056            threads = self.get_threads()
 6057        log.debug("Threads: " + str(threads))
 6058
 6059        # DEBUG
 6060        delete_tmp = True
 6061        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6062            delete_tmp = False
 6063            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6064
 6065        # Config
 6066        config = self.get_config()
 6067        log.debug("Config: " + str(config))
 6068        splice_config = config.get("tools", {}).get("splice", {})
 6069        if not splice_config:
 6070            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6071        if not splice_config:
 6072            msg_err = "No Splice tool config"
 6073            log.error(msg_err)
 6074            raise ValueError(msg_err)
 6075        log.debug(f"splice_config={splice_config}")
 6076
 6077        # Config - Folders - Databases
 6078        databases_folders = (
 6079            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6080        )
 6081        log.debug("Databases annotations: " + str(databases_folders))
 6082
 6083        # Splice docker image
 6084        splice_docker_image = splice_config.get("docker").get("image")
 6085
 6086        # Pull splice image if it's not already there
 6087        if not check_docker_image_exists(splice_docker_image):
 6088            log.warning(
 6089                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6090            )
 6091            try:
 6092                command(f"docker pull {splice_config.get('docker').get('image')}")
 6093            except subprocess.CalledProcessError:
 6094                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6095                log.error(msg_err)
 6096                raise ValueError(msg_err)
 6097                return None
 6098
 6099        # Config - splice databases
 6100        splice_databases = (
 6101            config.get("folders", {})
 6102            .get("databases", {})
 6103            .get("splice", DEFAULT_SPLICE_FOLDER)
 6104        )
 6105        splice_databases = full_path(splice_databases)
 6106
 6107        # Param
 6108        param = self.get_param()
 6109        log.debug("Param: " + str(param))
 6110
 6111        # Param
 6112        options = param.get("annotation", {}).get("splice", {})
 6113        log.debug("Options: " + str(options))
 6114
 6115        # Data
 6116        table_variants = self.get_table_variants()
 6117
 6118        # Check if not empty
 6119        log.debug("Check if not empty")
 6120        sql_query_chromosomes = (
 6121            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6122        )
 6123        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6124            log.info("VCF empty")
 6125            return None
 6126
 6127        # Export in VCF
 6128        log.debug("Create initial file to annotate")
 6129
 6130        # Create output folder
 6131        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6132        if not os.path.exists(output_folder):
 6133            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6134
 6135        # Create tmp VCF file
 6136        tmp_vcf = NamedTemporaryFile(
 6137            prefix=self.get_prefix(),
 6138            dir=output_folder,
 6139            suffix=".vcf",
 6140            delete=False,
 6141        )
 6142        tmp_vcf_name = tmp_vcf.name
 6143
 6144        # VCF header
 6145        header = self.get_header()
 6146
 6147        # Existing annotations
 6148        for vcf_annotation in self.get_header().infos:
 6149
 6150            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6151            log.debug(
 6152                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6153            )
 6154
 6155        # Memory limit
 6156        if config.get("memory", None):
 6157            memory_limit = config.get("memory", "8G").upper()
 6158            # upper()
 6159        else:
 6160            memory_limit = "8G"
 6161        log.debug(f"memory_limit: {memory_limit}")
 6162
 6163        # Check number of variants to annotate
 6164        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6165        where_clause_regex_spip = r"SPiP_\w+"
 6166        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6167        df_list_of_variants_to_annotate = self.get_query_to_df(
 6168            query=f""" SELECT * FROM variants {where_clause} """
 6169        )
 6170        if len(df_list_of_variants_to_annotate) == 0:
 6171            log.warning(
 6172                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6173            )
 6174            return None
 6175        else:
 6176            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6177
 6178        # Export VCF file
 6179        self.export_variant_vcf(
 6180            vcf_file=tmp_vcf_name,
 6181            remove_info=True,
 6182            add_samples=True,
 6183            index=False,
 6184            where_clause=where_clause,
 6185        )
 6186
 6187        # Create docker container and launch splice analysis
 6188        if splice_config:
 6189
 6190            # Splice mount folders
 6191            mount_folders = splice_config.get("mount", {})
 6192
 6193            # Genome mount
 6194            mount_folders[
 6195                config.get("folders", {})
 6196                .get("databases", {})
 6197                .get("genomes", DEFAULT_GENOME_FOLDER)
 6198            ] = "ro"
 6199
 6200            # SpliceAI mount
 6201            mount_folders[
 6202                config.get("folders", {})
 6203                .get("databases", {})
 6204                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6205            ] = "ro"
 6206
 6207            # Genome mount
 6208            mount_folders[
 6209                config.get("folders", {})
 6210                .get("databases", {})
 6211                .get("spip", DEFAULT_SPIP_FOLDER)
 6212            ] = "ro"
 6213
 6214            # Mount folders
 6215            mount = []
 6216
 6217            # Config mount
 6218            mount = [
 6219                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6220                for path, mode in mount_folders.items()
 6221            ]
 6222
 6223            if any(value for value in splice_config.values() if value is None):
 6224                log.warning("At least one splice config parameter is empty")
 6225                return None
 6226
 6227            # Params in splice nf
 6228            def check_values(dico: dict):
 6229                """
 6230                Ensure parameters for NF splice pipeline
 6231                """
 6232                for key, val in dico.items():
 6233                    if key == "genome":
 6234                        if any(
 6235                            assemb in options.get("genome", {})
 6236                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6237                        ):
 6238                            yield f"--{key} hg19"
 6239                        elif any(
 6240                            assemb in options.get("genome", {})
 6241                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6242                        ):
 6243                            yield f"--{key} hg38"
 6244                    elif (
 6245                        (isinstance(val, str) and val)
 6246                        or isinstance(val, int)
 6247                        or isinstance(val, bool)
 6248                    ):
 6249                        yield f"--{key} {val}"
 6250
 6251            # Genome
 6252            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6253            options["genome"] = genome
 6254
 6255            # NF params
 6256            nf_params = []
 6257
 6258            # Add options
 6259            if options:
 6260                nf_params = list(check_values(options))
 6261                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6262            else:
 6263                log.debug("No NF params provided")
 6264
 6265            # Add threads
 6266            if "threads" not in options.keys():
 6267                nf_params.append(f"--threads {threads}")
 6268
 6269            # Genome path
 6270            genome_path = find_genome(
 6271                config.get("folders", {})
 6272                .get("databases", {})
 6273                .get("genomes", DEFAULT_GENOME_FOLDER),
 6274                file=f"{genome}.fa",
 6275            )
 6276            # Add genome path
 6277            if not genome_path:
 6278                raise ValueError(
 6279                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6280                )
 6281            else:
 6282                log.debug(f"Genome: {genome_path}")
 6283                nf_params.append(f"--genome_path {genome_path}")
 6284
 6285            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6286                """
 6287                Setting up updated databases for SPiP and SpliceAI
 6288                """
 6289
 6290                try:
 6291
 6292                    # SpliceAI assembly transcriptome
 6293                    spliceai_assembly = os.path.join(
 6294                        config.get("folders", {})
 6295                        .get("databases", {})
 6296                        .get("spliceai", {}),
 6297                        options.get("genome"),
 6298                        "transcriptome",
 6299                    )
 6300                    spip_assembly = options.get("genome")
 6301
 6302                    spip = find(
 6303                        f"transcriptome_{spip_assembly}.RData",
 6304                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6305                    )
 6306                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6307                    log.debug(f"SPiP annotations: {spip}")
 6308                    log.debug(f"SpliceAI annotations: {spliceai}")
 6309                    if spip and spliceai:
 6310                        return [
 6311                            f"--spip_transcriptome {spip}",
 6312                            f"--spliceai_annotations {spliceai}",
 6313                        ]
 6314                    else:
 6315                        # TODO crash and go on with basic annotations ?
 6316                        # raise ValueError(
 6317                        #     "Can't find splice databases in configuration EXIT"
 6318                        # )
 6319                        log.warning(
 6320                            "Can't find splice databases in configuration, use annotations file from image"
 6321                        )
 6322                except TypeError:
 6323                    log.warning(
 6324                        "Can't find splice databases in configuration, use annotations file from image"
 6325                    )
 6326                    return []
 6327
 6328            # Add options, check if transcriptome option have already beend provided
 6329            if (
 6330                "spip_transcriptome" not in nf_params
 6331                and "spliceai_transcriptome" not in nf_params
 6332            ):
 6333                splice_reference = splice_annotations(options, config)
 6334                if splice_reference:
 6335                    nf_params.extend(splice_reference)
 6336
 6337            nf_params.append(f"--output_folder {output_folder}")
 6338
 6339            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6340            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6341            log.debug(cmd)
 6342
 6343            splice_config["docker"]["command"] = cmd
 6344
 6345            docker_cmd = get_bin_command(
 6346                tool="splice",
 6347                bin_type="docker",
 6348                config=config,
 6349                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6350                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6351            )
 6352
 6353            # Docker debug
 6354            # if splice_config.get("rm_container"):
 6355            #     rm_container = "--rm"
 6356            # else:
 6357            #     rm_container = ""
 6358            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6359
 6360            log.debug(docker_cmd)
 6361            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6362            log.debug(res.stdout)
 6363            if res.stderr:
 6364                log.error(res.stderr)
 6365            res.check_returncode()
 6366        else:
 6367            log.warning(f"Splice tool configuration not found: {config}")
 6368
 6369        # Update variants
 6370        log.info("Annotation - Updating...")
 6371        # Test find output vcf
 6372        log.debug(
 6373            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6374        )
 6375        output_vcf = []
 6376        # Wrong folder to look in
 6377        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6378            if (
 6379                files
 6380                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6381            ):
 6382                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6383        # log.debug(os.listdir(options.get("output_folder")))
 6384        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6385        if not output_vcf:
 6386            log.debug(
 6387                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6388            )
 6389        else:
 6390            # Get new header from annotated vcf
 6391            log.debug(f"Initial header: {len(header.infos)} fields")
 6392            # Create new header with splice infos
 6393            new_vcf = Variants(input=output_vcf[0])
 6394            new_vcf_header = new_vcf.get_header().infos
 6395            for keys, infos in new_vcf_header.items():
 6396                if keys not in header.infos.keys():
 6397                    header.infos[keys] = infos
 6398            log.debug(f"New header: {len(header.infos)} fields")
 6399            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6400            self.update_from_vcf(output_vcf[0])
 6401
 6402        # Remove folder
 6403        remove_if_exists(output_folder)
 6404
 6405    ###
 6406    # Prioritization
 6407    ###
 6408
 6409    def get_config_default(self, name: str) -> dict:
 6410        """
 6411        The function `get_config_default` returns a dictionary containing default configurations for
 6412        various calculations and prioritizations.
 6413
 6414        :param name: The `get_config_default` function returns a dictionary containing default
 6415        configurations for different calculations and prioritizations. The `name` parameter is used to
 6416        specify which specific configuration to retrieve from the dictionary
 6417        :type name: str
 6418        :return: The function `get_config_default` returns a dictionary containing default configuration
 6419        settings for different calculations and prioritizations. The specific configuration settings are
 6420        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6421        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6422        returned. If there is no match, an empty dictionary is returned.
 6423        """
 6424
 6425        config_default = {
 6426            "calculations": {
 6427                "variant_chr_pos_alt_ref": {
 6428                    "type": "sql",
 6429                    "name": "variant_chr_pos_alt_ref",
 6430                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6431                    "available": False,
 6432                    "output_column_name": "variant_chr_pos_alt_ref",
 6433                    "output_column_type": "String",
 6434                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6435                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6436                    "operation_info": True,
 6437                },
 6438                "VARTYPE": {
 6439                    "type": "sql",
 6440                    "name": "VARTYPE",
 6441                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6442                    "available": True,
 6443                    "output_column_name": "VARTYPE",
 6444                    "output_column_type": "String",
 6445                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6446                    "operation_query": """
 6447                            CASE
 6448                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6449                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6450                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6451                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6452                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6453                                ELSE 'UNDEFINED'
 6454                            END
 6455                            """,
 6456                    "info_fields": ["SVTYPE"],
 6457                    "operation_info": True,
 6458                },
 6459                "snpeff_hgvs": {
 6460                    "type": "python",
 6461                    "name": "snpeff_hgvs",
 6462                    "description": "HGVS nomenclatures from snpEff annotation",
 6463                    "available": True,
 6464                    "function_name": "calculation_extract_snpeff_hgvs",
 6465                    "function_params": ["snpeff_hgvs", "ANN"],
 6466                },
 6467                "snpeff_ann_explode": {
 6468                    "type": "python",
 6469                    "name": "snpeff_ann_explode",
 6470                    "description": "Explode snpEff annotations with uniquify values",
 6471                    "available": True,
 6472                    "function_name": "calculation_snpeff_ann_explode",
 6473                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6474                },
 6475                "snpeff_ann_explode_uniquify": {
 6476                    "type": "python",
 6477                    "name": "snpeff_ann_explode_uniquify",
 6478                    "description": "Explode snpEff annotations",
 6479                    "available": True,
 6480                    "function_name": "calculation_snpeff_ann_explode",
 6481                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6482                },
 6483                "snpeff_ann_explode_json": {
 6484                    "type": "python",
 6485                    "name": "snpeff_ann_explode_json",
 6486                    "description": "Explode snpEff annotations in JSON format",
 6487                    "available": True,
 6488                    "function_name": "calculation_snpeff_ann_explode",
 6489                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6490                },
 6491                "NOMEN": {
 6492                    "type": "python",
 6493                    "name": "NOMEN",
 6494                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6495                    "available": True,
 6496                    "function_name": "calculation_extract_nomen",
 6497                    "function_params": [],
 6498                },
 6499                "FINDBYPIPELINE": {
 6500                    "type": "python",
 6501                    "name": "FINDBYPIPELINE",
 6502                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6503                    "available": True,
 6504                    "function_name": "calculation_find_by_pipeline",
 6505                    "function_params": ["findbypipeline"],
 6506                },
 6507                "FINDBYSAMPLE": {
 6508                    "type": "python",
 6509                    "name": "FINDBYSAMPLE",
 6510                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6511                    "available": True,
 6512                    "function_name": "calculation_find_by_pipeline",
 6513                    "function_params": ["findbysample"],
 6514                },
 6515                "GENOTYPECONCORDANCE": {
 6516                    "type": "python",
 6517                    "name": "GENOTYPECONCORDANCE",
 6518                    "description": "Concordance of genotype for multi caller VCF",
 6519                    "available": True,
 6520                    "function_name": "calculation_genotype_concordance",
 6521                    "function_params": [],
 6522                },
 6523                "BARCODE": {
 6524                    "type": "python",
 6525                    "name": "BARCODE",
 6526                    "description": "BARCODE as VaRank tool",
 6527                    "available": True,
 6528                    "function_name": "calculation_barcode",
 6529                    "function_params": [],
 6530                },
 6531                "BARCODEFAMILY": {
 6532                    "type": "python",
 6533                    "name": "BARCODEFAMILY",
 6534                    "description": "BARCODEFAMILY as VaRank tool",
 6535                    "available": True,
 6536                    "function_name": "calculation_barcode_family",
 6537                    "function_params": ["BCF"],
 6538                },
 6539                "TRIO": {
 6540                    "type": "python",
 6541                    "name": "TRIO",
 6542                    "description": "Inheritance for a trio family",
 6543                    "available": True,
 6544                    "function_name": "calculation_trio",
 6545                    "function_params": [],
 6546                },
 6547                "VAF": {
 6548                    "type": "python",
 6549                    "name": "VAF",
 6550                    "description": "Variant Allele Frequency (VAF) harmonization",
 6551                    "available": True,
 6552                    "function_name": "calculation_vaf_normalization",
 6553                    "function_params": [],
 6554                },
 6555                "VAF_stats": {
 6556                    "type": "python",
 6557                    "name": "VAF_stats",
 6558                    "description": "Variant Allele Frequency (VAF) statistics",
 6559                    "available": True,
 6560                    "function_name": "calculation_genotype_stats",
 6561                    "function_params": ["VAF"],
 6562                },
 6563                "DP_stats": {
 6564                    "type": "python",
 6565                    "name": "DP_stats",
 6566                    "description": "Depth (DP) statistics",
 6567                    "available": True,
 6568                    "function_name": "calculation_genotype_stats",
 6569                    "function_params": ["DP"],
 6570                },
 6571                "variant_id": {
 6572                    "type": "python",
 6573                    "name": "variant_id",
 6574                    "description": "Variant ID generated from variant position and type",
 6575                    "available": True,
 6576                    "function_name": "calculation_variant_id",
 6577                    "function_params": [],
 6578                },
 6579                "transcripts_json": {
 6580                    "type": "python",
 6581                    "name": "transcripts_json",
 6582                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6583                    "available": True,
 6584                    "function_name": "calculation_transcripts_annotation",
 6585                    "function_params": ["transcripts_json", None],
 6586                },
 6587                "transcripts_ann": {
 6588                    "type": "python",
 6589                    "name": "transcripts_ann",
 6590                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6591                    "available": True,
 6592                    "function_name": "calculation_transcripts_annotation",
 6593                    "function_params": [None, "transcripts_ann"],
 6594                },
 6595                "transcripts_annotations": {
 6596                    "type": "python",
 6597                    "name": "transcripts_annotations",
 6598                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6599                    "available": True,
 6600                    "function_name": "calculation_transcripts_annotation",
 6601                    "function_params": [None, None],
 6602                },
 6603                "transcripts_prioritization": {
 6604                    "type": "python",
 6605                    "name": "transcripts_prioritization",
 6606                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6607                    "available": True,
 6608                    "function_name": "calculation_transcripts_prioritization",
 6609                    "function_params": [],
 6610                },
 6611            },
 6612            "prioritizations": {
 6613                "default": {
 6614                    "ANN2": [
 6615                        {
 6616                            "type": "contains",
 6617                            "value": "HIGH",
 6618                            "score": 5,
 6619                            "flag": "PASS",
 6620                            "comment": [
 6621                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6622                            ],
 6623                        },
 6624                        {
 6625                            "type": "contains",
 6626                            "value": "MODERATE",
 6627                            "score": 3,
 6628                            "flag": "PASS",
 6629                            "comment": [
 6630                                "A non-disruptive variant that might change protein effectiveness"
 6631                            ],
 6632                        },
 6633                        {
 6634                            "type": "contains",
 6635                            "value": "LOW",
 6636                            "score": 0,
 6637                            "flag": "FILTERED",
 6638                            "comment": [
 6639                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6640                            ],
 6641                        },
 6642                        {
 6643                            "type": "contains",
 6644                            "value": "MODIFIER",
 6645                            "score": 0,
 6646                            "flag": "FILTERED",
 6647                            "comment": [
 6648                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6649                            ],
 6650                        },
 6651                    ],
 6652                }
 6653            },
 6654        }
 6655
 6656        return config_default.get(name, None)
 6657
 6658    def get_config_json(
 6659        self, name: str, config_dict: dict = {}, config_file: str = None
 6660    ) -> dict:
 6661        """
 6662        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6663        default values, a dictionary, and a file.
 6664
 6665        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6666        the name of the configuration. It is used to identify and retrieve the configuration settings
 6667        for a specific component or module
 6668        :type name: str
 6669        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6670        dictionary that allows you to provide additional configuration settings or overrides. When you
 6671        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6672        the key is the configuration setting you want to override or
 6673        :type config_dict: dict
 6674        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6675        specify the path to a configuration file that contains additional settings. If provided, the
 6676        function will read the contents of this file and update the configuration dictionary with the
 6677        values found in the file, overriding any existing values with the
 6678        :type config_file: str
 6679        :return: The function `get_config_json` returns a dictionary containing the configuration
 6680        settings.
 6681        """
 6682
 6683        # Create with default prioritizations
 6684        config_default = self.get_config_default(name=name)
 6685        configuration = config_default
 6686        # log.debug(f"configuration={configuration}")
 6687
 6688        # Replace prioritizations from dict
 6689        for config in config_dict:
 6690            configuration[config] = config_dict[config]
 6691
 6692        # Replace prioritizations from file
 6693        config_file = full_path(config_file)
 6694        if config_file:
 6695            if os.path.exists(config_file):
 6696                with open(config_file) as config_file_content:
 6697                    config_file_dict = json.load(config_file_content)
 6698                for config in config_file_dict:
 6699                    configuration[config] = config_file_dict[config]
 6700            else:
 6701                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6702                log.error(msg_error)
 6703                raise ValueError(msg_error)
 6704
 6705        return configuration
 6706
 6707    def prioritization(
 6708        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6709    ) -> bool:
 6710        """
 6711        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6712        prioritizes variants based on configured profiles and criteria.
 6713
 6714        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6715        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6716        a table name is provided, the method will prioritize the variants in that specific table
 6717        :type table: str
 6718        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6719        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6720        provided, the code will use a default prefix value of "PZ"
 6721        :type pz_prefix: str
 6722        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6723        additional parameters specific to the prioritization process. These parameters can include
 6724        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6725        configurations needed for the prioritization of variants in a V
 6726        :type pz_param: dict
 6727        :return: A boolean value (True) is being returned from the `prioritization` function.
 6728        """
 6729
 6730        # Config
 6731        config = self.get_config()
 6732
 6733        # Param
 6734        param = self.get_param()
 6735
 6736        # Prioritization param
 6737        if pz_param is not None:
 6738            prioritization_param = pz_param
 6739        else:
 6740            prioritization_param = param.get("prioritization", {})
 6741
 6742        # Configuration profiles
 6743        prioritization_config_file = prioritization_param.get(
 6744            "prioritization_config", None
 6745        )
 6746        prioritization_config_file = full_path(prioritization_config_file)
 6747        prioritizations_config = self.get_config_json(
 6748            name="prioritizations", config_file=prioritization_config_file
 6749        )
 6750
 6751        # Prioritization prefix
 6752        pz_prefix_default = "PZ"
 6753        if pz_prefix is None:
 6754            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6755
 6756        # Prioritization options
 6757        profiles = prioritization_param.get("profiles", [])
 6758        if isinstance(profiles, str):
 6759            profiles = profiles.split(",")
 6760        pzfields = prioritization_param.get(
 6761            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6762        )
 6763        if isinstance(pzfields, str):
 6764            pzfields = pzfields.split(",")
 6765        default_profile = prioritization_param.get("default_profile", None)
 6766        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6767        prioritization_score_mode = prioritization_param.get(
 6768            "prioritization_score_mode", "HOWARD"
 6769        )
 6770
 6771        # Quick Prioritizations
 6772        prioritizations = param.get("prioritizations", None)
 6773        if prioritizations:
 6774            log.info("Quick Prioritization:")
 6775            for profile in prioritizations.split(","):
 6776                if profile not in profiles:
 6777                    profiles.append(profile)
 6778                    log.info(f"   {profile}")
 6779
 6780        # If profile "ALL" provided, all profiles in the config profiles
 6781        if "ALL" in profiles:
 6782            profiles = list(prioritizations_config.keys())
 6783
 6784        for profile in profiles:
 6785            if prioritizations_config.get(profile, None):
 6786                log.debug(f"Profile '{profile}' configured")
 6787            else:
 6788                msg_error = f"Profile '{profile}' NOT configured"
 6789                log.error(msg_error)
 6790                raise ValueError(msg_error)
 6791
 6792        if profiles:
 6793            log.info(f"Prioritization... ")
 6794        else:
 6795            log.debug(f"No profile defined")
 6796            return False
 6797
 6798        if not default_profile and len(profiles):
 6799            default_profile = profiles[0]
 6800
 6801        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6802        log.debug("Profiles to check: " + str(list(profiles)))
 6803
 6804        # Variables
 6805        if table is not None:
 6806            table_variants = table
 6807        else:
 6808            table_variants = self.get_table_variants(clause="update")
 6809        log.debug(f"Table to prioritize: {table_variants}")
 6810
 6811        # Added columns
 6812        added_columns = []
 6813
 6814        # Create list of PZfields
 6815        # List of PZFields
 6816        list_of_pzfields_original = pzfields + [
 6817            pzfield + pzfields_sep + profile
 6818            for pzfield in pzfields
 6819            for profile in profiles
 6820        ]
 6821        list_of_pzfields = []
 6822        log.debug(f"{list_of_pzfields_original}")
 6823
 6824        # Remove existing PZfields to use if exists
 6825        for pzfield in list_of_pzfields_original:
 6826            if self.get_header().infos.get(pzfield, None) is None:
 6827                list_of_pzfields.append(pzfield)
 6828                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6829            else:
 6830                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6831
 6832        if list_of_pzfields:
 6833
 6834            # Explode Infos prefix
 6835            explode_infos_prefix = self.get_explode_infos_prefix()
 6836
 6837            # PZfields tags description
 6838            PZfields_INFOS = {
 6839                f"{pz_prefix}Tags": {
 6840                    "ID": f"{pz_prefix}Tags",
 6841                    "Number": ".",
 6842                    "Type": "String",
 6843                    "Description": "Variant tags based on annotation criteria",
 6844                },
 6845                f"{pz_prefix}Score": {
 6846                    "ID": f"{pz_prefix}Score",
 6847                    "Number": 1,
 6848                    "Type": "Integer",
 6849                    "Description": "Variant score based on annotation criteria",
 6850                },
 6851                f"{pz_prefix}Flag": {
 6852                    "ID": f"{pz_prefix}Flag",
 6853                    "Number": 1,
 6854                    "Type": "String",
 6855                    "Description": "Variant flag based on annotation criteria",
 6856                },
 6857                f"{pz_prefix}Comment": {
 6858                    "ID": f"{pz_prefix}Comment",
 6859                    "Number": ".",
 6860                    "Type": "String",
 6861                    "Description": "Variant comment based on annotation criteria",
 6862                },
 6863                f"{pz_prefix}Infos": {
 6864                    "ID": f"{pz_prefix}Infos",
 6865                    "Number": ".",
 6866                    "Type": "String",
 6867                    "Description": "Variant infos based on annotation criteria",
 6868                },
 6869                f"{pz_prefix}Class": {
 6870                    "ID": f"{pz_prefix}Class",
 6871                    "Number": ".",
 6872                    "Type": "String",
 6873                    "Description": "Variant class based on annotation criteria",
 6874                },
 6875            }
 6876
 6877            # Create INFO fields if not exist
 6878            for field in PZfields_INFOS:
 6879                field_ID = PZfields_INFOS[field]["ID"]
 6880                field_description = PZfields_INFOS[field]["Description"]
 6881                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6882                    field_description = (
 6883                        PZfields_INFOS[field]["Description"]
 6884                        + f", profile {default_profile}"
 6885                    )
 6886                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6887                        field_ID,
 6888                        PZfields_INFOS[field]["Number"],
 6889                        PZfields_INFOS[field]["Type"],
 6890                        field_description,
 6891                        "unknown",
 6892                        "unknown",
 6893                        code_type_map[PZfields_INFOS[field]["Type"]],
 6894                    )
 6895
 6896            # Create INFO fields if not exist for each profile
 6897            for profile in prioritizations_config:
 6898                if profile in profiles or profiles == []:
 6899                    for field in PZfields_INFOS:
 6900                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6901                        field_description = (
 6902                            PZfields_INFOS[field]["Description"]
 6903                            + f", profile {profile}"
 6904                        )
 6905                        if (
 6906                            field_ID not in self.get_header().infos
 6907                            and field in pzfields
 6908                        ):
 6909                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6910                                field_ID,
 6911                                PZfields_INFOS[field]["Number"],
 6912                                PZfields_INFOS[field]["Type"],
 6913                                field_description,
 6914                                "unknown",
 6915                                "unknown",
 6916                                code_type_map[PZfields_INFOS[field]["Type"]],
 6917                            )
 6918
 6919            # Header
 6920            for pzfield in list_of_pzfields:
 6921                if re.match(f"{pz_prefix}Score.*", pzfield):
 6922                    added_column = self.add_column(
 6923                        table_name=table_variants,
 6924                        column_name=pzfield,
 6925                        column_type="INTEGER",
 6926                        default_value="0",
 6927                    )
 6928                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6929                    added_column = self.add_column(
 6930                        table_name=table_variants,
 6931                        column_name=pzfield,
 6932                        column_type="BOOLEAN",
 6933                        default_value="1",
 6934                    )
 6935                elif re.match(f"{pz_prefix}Class.*", pzfield):
 6936                    added_column = self.add_column(
 6937                        table_name=table_variants,
 6938                        column_name=pzfield,
 6939                        column_type="VARCHAR[]",
 6940                        default_value="null",
 6941                    )
 6942                else:
 6943                    added_column = self.add_column(
 6944                        table_name=table_variants,
 6945                        column_name=pzfield,
 6946                        column_type="STRING",
 6947                        default_value="''",
 6948                    )
 6949                added_columns.append(added_column)
 6950
 6951            # Profiles
 6952            if profiles:
 6953
 6954                # foreach profile in configuration file
 6955                for profile in prioritizations_config:
 6956
 6957                    # If profile is asked in param, or ALL are asked (empty profile [])
 6958                    if profile in profiles or profiles == []:
 6959                        log.info(f"Profile '{profile}'")
 6960
 6961                        sql_set_info_option = ""
 6962
 6963                        sql_set_info = []
 6964
 6965                        # PZ fields set
 6966
 6967                        # PZScore
 6968                        if (
 6969                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6970                            in list_of_pzfields
 6971                        ):
 6972                            sql_set_info.append(
 6973                                f"""
 6974                                    concat(
 6975                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6976                                        {pz_prefix}Score{pzfields_sep}{profile}
 6977                                    ) 
 6978                                """
 6979                            )
 6980                            if (
 6981                                profile == default_profile
 6982                                and f"{pz_prefix}Score" in list_of_pzfields
 6983                            ):
 6984                                sql_set_info.append(
 6985                                    f"""
 6986                                        concat(
 6987                                            '{pz_prefix}Score=',
 6988                                            {pz_prefix}Score{pzfields_sep}{profile}
 6989                                        )
 6990                                    """
 6991                                )
 6992
 6993                        # PZFlag
 6994                        if (
 6995                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 6996                            in list_of_pzfields
 6997                        ):
 6998                            sql_set_info.append(
 6999                                f"""
 7000                                    concat(
 7001                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 7002                                        CASE 
 7003                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7004                                            THEN 'PASS'
 7005                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7006                                            THEN 'FILTERED'
 7007                                        END
 7008                                    ) 
 7009                                """
 7010                            )
 7011                            if (
 7012                                profile == default_profile
 7013                                and f"{pz_prefix}Flag" in list_of_pzfields
 7014                            ):
 7015                                sql_set_info.append(
 7016                                    f"""
 7017                                        concat(
 7018                                            '{pz_prefix}Flag=',
 7019                                            CASE 
 7020                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7021                                                THEN 'PASS'
 7022                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7023                                                THEN 'FILTERED'
 7024                                            END
 7025                                        )
 7026                                    """
 7027                                )
 7028
 7029                        # PZClass
 7030                        if (
 7031                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7032                            in list_of_pzfields
 7033                        ):
 7034                            sql_set_info.append(
 7035                                f"""
 7036                                    concat(
 7037                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7038                                        CASE
 7039                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7040                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7041                                            ELSE '.'
 7042                                        END 
 7043                                    )
 7044                                    
 7045                                """
 7046                            )
 7047                            if (
 7048                                profile == default_profile
 7049                                and f"{pz_prefix}Class" in list_of_pzfields
 7050                            ):
 7051                                sql_set_info.append(
 7052                                    f"""
 7053                                        concat(
 7054                                            '{pz_prefix}Class=',
 7055                                            CASE
 7056                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7057                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7058                                                ELSE '.'
 7059                                            END 
 7060                                        )
 7061                                    """
 7062                                )
 7063
 7064                        # PZComment
 7065                        if (
 7066                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7067                            in list_of_pzfields
 7068                        ):
 7069                            sql_set_info.append(
 7070                                f"""
 7071                                    CASE
 7072                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7073                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7074                                        ELSE ''
 7075                                    END
 7076                                """
 7077                            )
 7078                            if (
 7079                                profile == default_profile
 7080                                and f"{pz_prefix}Comment" in list_of_pzfields
 7081                            ):
 7082                                sql_set_info.append(
 7083                                    f"""
 7084                                        CASE
 7085                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7086                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7087                                            ELSE ''
 7088                                        END
 7089                                    """
 7090                                )
 7091
 7092                        # PZInfos
 7093                        if (
 7094                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7095                            in list_of_pzfields
 7096                        ):
 7097                            sql_set_info.append(
 7098                                f"""
 7099                                    CASE
 7100                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7101                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7102                                        ELSE ''
 7103                                    END
 7104                                """
 7105                            )
 7106                            if (
 7107                                profile == default_profile
 7108                                and f"{pz_prefix}Infos" in list_of_pzfields
 7109                            ):
 7110                                sql_set_info.append(
 7111                                    f"""
 7112                                        CASE
 7113                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7114                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7115                                            ELSE ''
 7116                                        END
 7117                                    """
 7118                                )
 7119
 7120                        # Merge PZfields
 7121                        sql_set_info_option = ""
 7122                        sql_set_sep = ""
 7123                        for sql_set in sql_set_info:
 7124                            if sql_set_sep:
 7125                                sql_set_info_option += f"""
 7126                                    , concat('{sql_set_sep}', {sql_set})
 7127                                """
 7128                            else:
 7129                                sql_set_info_option += f"""
 7130                                    , {sql_set}
 7131                                """
 7132                            sql_set_sep = ";"
 7133
 7134                        sql_queries = []
 7135                        for annotation in prioritizations_config[profile]:
 7136
 7137                            # skip special sections
 7138                            if annotation.startswith("_"):
 7139                                continue
 7140
 7141                            # For each criterions
 7142                            for criterion in prioritizations_config[profile][
 7143                                annotation
 7144                            ]:
 7145
 7146                                # Criterion mode
 7147                                criterion_mode = None
 7148                                if np.any(
 7149                                    np.isin(list(criterion.keys()), ["type", "value"])
 7150                                ):
 7151                                    criterion_mode = "operation"
 7152                                elif np.any(
 7153                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7154                                ):
 7155                                    criterion_mode = "sql"
 7156                                log.debug(f"Criterion Mode: {criterion_mode}")
 7157
 7158                                # Criterion parameters
 7159                                criterion_type = criterion.get("type", None)
 7160                                criterion_value = criterion.get("value", None)
 7161                                criterion_sql = criterion.get("sql", None)
 7162                                criterion_fields = criterion.get("fields", None)
 7163                                criterion_score = criterion.get("score", 0)
 7164                                criterion_flag = criterion.get("flag", "PASS")
 7165                                criterion_class = criterion.get("class", None)
 7166                                criterion_flag_bool = criterion_flag == "PASS"
 7167                                criterion_comment = (
 7168                                    ", ".join(criterion.get("comment", []))
 7169                                    .replace("'", "''")
 7170                                    .replace(";", ",")
 7171                                    .replace("\t", " ")
 7172                                )
 7173                                criterion_infos = (
 7174                                    str(criterion)
 7175                                    .replace("'", "''")
 7176                                    .replace(";", ",")
 7177                                    .replace("\t", " ")
 7178                                )
 7179
 7180                                # SQL
 7181                                if criterion_sql is not None and isinstance(
 7182                                    criterion_sql, list
 7183                                ):
 7184                                    criterion_sql = " ".join(criterion_sql)
 7185
 7186                                # Fields and explode
 7187                                if criterion_fields is None:
 7188                                    criterion_fields = [annotation]
 7189                                if not isinstance(criterion_fields, list):
 7190                                    criterion_fields = str(criterion_fields).split(",")
 7191
 7192                                # Class
 7193                                if criterion_class is not None and not isinstance(
 7194                                    criterion_class, list
 7195                                ):
 7196                                    criterion_class = str(criterion_class).split(",")
 7197
 7198                                for annotation_field in criterion_fields:
 7199
 7200                                    # Explode specific annotation
 7201                                    log.debug(
 7202                                        f"Explode annotation '{annotation_field}'"
 7203                                    )
 7204                                    added_columns += self.explode_infos(
 7205                                        prefix=explode_infos_prefix,
 7206                                        fields=[annotation_field],
 7207                                        table=table_variants,
 7208                                    )
 7209                                    extra_infos = self.get_extra_infos(
 7210                                        table=table_variants
 7211                                    )
 7212
 7213                                    # Check if annotation field is present
 7214                                    if (
 7215                                        f"{explode_infos_prefix}{annotation_field}"
 7216                                        not in extra_infos
 7217                                    ):
 7218                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7219                                        log.error(msq_err)
 7220                                        raise ValueError(msq_err)
 7221                                    else:
 7222                                        log.debug(
 7223                                            f"Annotation '{annotation_field}' in data"
 7224                                        )
 7225
 7226                                sql_set = []
 7227                                sql_set_info = []
 7228
 7229                                # PZ fields set
 7230
 7231                                # PZScore
 7232                                if (
 7233                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7234                                    in list_of_pzfields
 7235                                ):
 7236                                    # if prioritization_score_mode == "HOWARD":
 7237                                    #     sql_set.append(
 7238                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7239                                    #     )
 7240                                    # VaRank prioritization score mode
 7241                                    if prioritization_score_mode == "VaRank":
 7242                                        sql_set.append(
 7243                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7244                                        )
 7245                                    # default HOWARD prioritization score mode
 7246                                    else:
 7247                                        sql_set.append(
 7248                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7249                                        )
 7250
 7251                                # PZFlag
 7252                                if (
 7253                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7254                                    in list_of_pzfields
 7255                                ):
 7256                                    sql_set.append(
 7257                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7258                                    )
 7259
 7260                                # PZClass
 7261                                if (
 7262                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7263                                    in list_of_pzfields
 7264                                    and criterion_class is not None
 7265                                ):
 7266                                    sql_set.append(
 7267                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7268                                    )
 7269
 7270                                # PZComment
 7271                                if (
 7272                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7273                                    in list_of_pzfields
 7274                                ):
 7275                                    sql_set.append(
 7276                                        f"""
 7277                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7278                                                concat(
 7279                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7280                                                    CASE 
 7281                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7282                                                        THEN ', '
 7283                                                        ELSE ''
 7284                                                    END,
 7285                                                    '{criterion_comment}'
 7286                                                )
 7287                                        """
 7288                                    )
 7289
 7290                                # PZInfos
 7291                                if (
 7292                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7293                                    in list_of_pzfields
 7294                                ):
 7295                                    sql_set.append(
 7296                                        f"""
 7297                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7298                                                concat(
 7299                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7300                                                    '{criterion_infos}'
 7301                                                )
 7302                                        """
 7303                                    )
 7304                                sql_set_option = ",".join(sql_set)
 7305
 7306                                # Criterion and comparison
 7307                                if sql_set_option:
 7308
 7309                                    if criterion_mode in ["operation"]:
 7310
 7311                                        try:
 7312                                            float(criterion_value)
 7313                                            sql_update = f"""
 7314                                                UPDATE {table_variants}
 7315                                                SET {sql_set_option}
 7316                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7317                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7318                                            """
 7319                                        except:
 7320                                            contains_option = ""
 7321                                            if criterion_type == "contains":
 7322                                                contains_option = ".*"
 7323                                            sql_update = f"""
 7324                                                UPDATE {table_variants}
 7325                                                SET {sql_set_option}
 7326                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7327                                            """
 7328                                        sql_queries.append(sql_update)
 7329
 7330                                    elif criterion_mode in ["sql"]:
 7331
 7332                                        sql_update = f"""
 7333                                            UPDATE {table_variants}
 7334                                            SET {sql_set_option}
 7335                                            WHERE {criterion_sql}
 7336                                        """
 7337                                        sql_queries.append(sql_update)
 7338
 7339                                    else:
 7340                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7341                                        log.error(msg_err)
 7342                                        raise ValueError(msg_err)
 7343
 7344                                else:
 7345                                    log.warning(
 7346                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7347                                    )
 7348
 7349                        # PZTags
 7350                        if (
 7351                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7352                            in list_of_pzfields
 7353                        ):
 7354
                            # Create PZTags value
 7356                            pztags_value = ""
 7357                            pztags_sep_default = ","
 7358                            pztags_sep = ""
 7359                            for pzfield in pzfields:
 7360                                if pzfield not in [f"{pz_prefix}Tags"]:
 7361                                    if (
 7362                                        f"{pzfield}{pzfields_sep}{profile}"
 7363                                        in list_of_pzfields
 7364                                    ):
 7365                                        if pzfield in [f"{pz_prefix}Flag"]:
 7366                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7367                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7368                                                    THEN 'PASS'
 7369                                                    ELSE 'FILTERED'
 7370                                                END, '"""
 7371                                        elif pzfield in [f"{pz_prefix}Class"]:
 7372                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7373                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7374                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7375                                                    ELSE '.'
 7376                                                END, '"""
 7377                                        else:
 7378                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7379                                        pztags_sep = pztags_sep_default
 7380
                            # Add query update for PZTags
 7382                            sql_update_pztags = f"""
 7383                                UPDATE {table_variants}
 7384                                SET INFO = concat(
 7385                                        INFO,
 7386                                        CASE WHEN INFO NOT in ('','.')
 7387                                                THEN ';'
 7388                                                ELSE ''
 7389                                        END,
 7390                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7391                                    )
 7392                                """
 7393                            sql_queries.append(sql_update_pztags)
 7394
                            # Add query update for PZTags for the default profile
 7396                            if profile == default_profile:
 7397                                sql_update_pztags_default = f"""
 7398                                UPDATE {table_variants}
 7399                                SET INFO = concat(
 7400                                        INFO,
 7401                                        ';',
 7402                                        '{pz_prefix}Tags={pztags_value}'
 7403                                    )
 7404                                """
 7405                                sql_queries.append(sql_update_pztags_default)
 7406
 7407                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7408
 7409                        if sql_queries:
 7410
 7411                            for sql_query in sql_queries:
 7412                                log.debug(
 7413                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7414                                )
 7415                                self.conn.execute(sql_query)
 7416
 7417                        log.info(f"""Profile '{profile}' - Update... """)
 7418                        sql_query_update = f"""
 7419                            UPDATE {table_variants}
 7420                            SET INFO =  
 7421                                concat(
 7422                                    CASE
 7423                                        WHEN INFO NOT IN ('','.')
 7424                                        THEN concat(INFO, ';')
 7425                                        ELSE ''
 7426                                    END
 7427                                    {sql_set_info_option}
 7428                                )
 7429                        """
 7430                        self.conn.execute(sql_query_update)
 7431
 7432        else:
 7433
 7434            log.warning(f"No profiles in parameters")
 7435
 7436        # Remove added columns
 7437        for added_column in added_columns:
 7438            self.drop_column(column=added_column)
 7439
 7440        # Explode INFOS fields into table fields
 7441        if self.get_explode_infos():
 7442            self.explode_infos(
 7443                prefix=self.get_explode_infos_prefix(),
 7444                fields=self.get_explode_infos_fields(),
 7445                force=True,
 7446            )
 7447
 7448        return True
 7449
 7450    ###
 7451    # HGVS
 7452    ###
 7453
 7454    def annotation_hgvs(self, threads: int = None) -> None:
 7455        """
 7456        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7457        coordinates and alleles.
 7458
 7459        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7460        threads to use for parallel processing. If no value is provided, it will default to the number
 7461        of threads obtained from the `get_threads()` method
 7462        :type threads: int
 7463        """
 7464
 7465        # Function for each partition of the Dask Dataframe
 7466        def partition_function(partition):
 7467            """
 7468            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7469            each row of a DataFrame called `partition`.
 7470
 7471            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7472            to be processed
 7473            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7474            the "partition" dataframe along the axis 1.
 7475            """
 7476            return partition.apply(annotation_hgvs_partition, axis=1)
 7477
 7478        def annotation_hgvs_partition(row) -> str:
 7479            """
 7480            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7481            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7482
 7483            :param row: A dictionary-like object that contains the values for the following keys:
 7484            :return: a string that contains the HGVS names associated with the given row of data.
 7485            """
 7486
 7487            chr = row["CHROM"]
 7488            pos = row["POS"]
 7489            ref = row["REF"]
 7490            alt = row["ALT"]
 7491
 7492            # Find list of associated transcripts
 7493            transcripts_list = list(
 7494                polars_conn.execute(
 7495                    f"""
 7496                SELECT transcript
 7497                FROM refseq_df
 7498                WHERE CHROM='{chr}'
 7499                AND POS={pos}
 7500            """
 7501                )["transcript"]
 7502            )
 7503
 7504            # Full HGVS annotation in list
 7505            hgvs_full_list = []
 7506
 7507            for transcript_name in transcripts_list:
 7508
 7509                # Transcript
 7510                transcript = get_transcript(
 7511                    transcripts=transcripts, transcript_name=transcript_name
 7512                )
 7513                # Exon
 7514                if use_exon:
 7515                    exon = transcript.find_exon_number(pos)
 7516                else:
 7517                    exon = None
 7518                # Protein
 7519                transcript_protein = None
 7520                if use_protein or add_protein or full_format:
 7521                    transcripts_protein = list(
 7522                        polars_conn.execute(
 7523                            f"""
 7524                        SELECT protein
 7525                        FROM refseqlink_df
 7526                        WHERE transcript='{transcript_name}'
 7527                        LIMIT 1
 7528                    """
 7529                        )["protein"]
 7530                    )
 7531                    if len(transcripts_protein):
 7532                        transcript_protein = transcripts_protein[0]
 7533
 7534                # HGVS name
 7535                hgvs_name = format_hgvs_name(
 7536                    chr,
 7537                    pos,
 7538                    ref,
 7539                    alt,
 7540                    genome=genome,
 7541                    transcript=transcript,
 7542                    transcript_protein=transcript_protein,
 7543                    exon=exon,
 7544                    use_gene=use_gene,
 7545                    use_protein=use_protein,
 7546                    full_format=full_format,
 7547                    use_version=use_version,
 7548                    codon_type=codon_type,
 7549                )
 7550                hgvs_full_list.append(hgvs_name)
 7551                if add_protein and not use_protein and not full_format:
 7552                    hgvs_name = format_hgvs_name(
 7553                        chr,
 7554                        pos,
 7555                        ref,
 7556                        alt,
 7557                        genome=genome,
 7558                        transcript=transcript,
 7559                        transcript_protein=transcript_protein,
 7560                        exon=exon,
 7561                        use_gene=use_gene,
 7562                        use_protein=True,
 7563                        full_format=False,
 7564                        use_version=use_version,
 7565                        codon_type=codon_type,
 7566                    )
 7567                    hgvs_full_list.append(hgvs_name)
 7568
 7569            # Create liste of HGVS annotations
 7570            hgvs_full = ",".join(hgvs_full_list)
 7571
 7572            return hgvs_full
 7573
 7574        # Polars connexion
 7575        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7576
 7577        # Config
 7578        config = self.get_config()
 7579
 7580        # Databases
 7581        # Genome
 7582        databases_genomes_folders = (
 7583            config.get("folders", {})
 7584            .get("databases", {})
 7585            .get("genomes", DEFAULT_GENOME_FOLDER)
 7586        )
 7587        databases_genome = (
 7588            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7589        )
 7590        # refseq database folder
 7591        databases_refseq_folders = (
 7592            config.get("folders", {})
 7593            .get("databases", {})
 7594            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7595        )
 7596        # refseq
 7597        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7598        # refSeqLink
 7599        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7600
 7601        # Param
 7602        param = self.get_param()
 7603
 7604        # Quick HGVS
 7605        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7606            log.info(f"Quick HGVS Annotation:")
 7607            if not param.get("hgvs", None):
 7608                param["hgvs"] = {}
 7609            for option in param.get("hgvs_options", "").split(","):
 7610                option_var_val = option.split("=")
 7611                option_var = option_var_val[0]
 7612                if len(option_var_val) > 1:
 7613                    option_val = option_var_val[1]
 7614                else:
 7615                    option_val = "True"
 7616                if option_val.upper() in ["TRUE"]:
 7617                    option_val = True
 7618                elif option_val.upper() in ["FALSE"]:
 7619                    option_val = False
 7620                log.info(f"   {option_var}={option_val}")
 7621                param["hgvs"][option_var] = option_val
 7622
 7623        # Check if HGVS annotation enabled
 7624        if "hgvs" in param:
 7625            log.info(f"HGVS Annotation... ")
 7626            for hgvs_option in param.get("hgvs", {}):
 7627                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7628        else:
 7629            return
 7630
 7631        # HGVS Param
 7632        param_hgvs = param.get("hgvs", {})
 7633        use_exon = param_hgvs.get("use_exon", False)
 7634        use_gene = param_hgvs.get("use_gene", False)
 7635        use_protein = param_hgvs.get("use_protein", False)
 7636        add_protein = param_hgvs.get("add_protein", False)
 7637        full_format = param_hgvs.get("full_format", False)
 7638        use_version = param_hgvs.get("use_version", False)
 7639        codon_type = param_hgvs.get("codon_type", "3")
 7640
        # refSeq refSeqLink
 7642        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7643        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7644
 7645        # Assembly
 7646        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7647
 7648        # Genome
 7649        genome_file = None
 7650        if find_genome(databases_genome):
 7651            genome_file = find_genome(databases_genome)
 7652        else:
 7653            genome_file = find_genome(
 7654                genome_path=databases_genomes_folders, assembly=assembly
 7655            )
 7656        log.debug("Genome: " + str(genome_file))
 7657
        # refSeq
 7659        refseq_file = find_file_prefix(
 7660            input_file=databases_refseq,
 7661            prefix="ncbiRefSeq",
 7662            folder=databases_refseq_folders,
 7663            assembly=assembly,
 7664        )
 7665        log.debug("refSeq: " + str(refseq_file))
 7666
 7667        # refSeqLink
 7668        refseqlink_file = find_file_prefix(
 7669            input_file=databases_refseqlink,
 7670            prefix="ncbiRefSeqLink",
 7671            folder=databases_refseq_folders,
 7672            assembly=assembly,
 7673        )
 7674        log.debug("refSeqLink: " + str(refseqlink_file))
 7675
 7676        # Threads
 7677        if not threads:
 7678            threads = self.get_threads()
 7679        log.debug("Threads: " + str(threads))
 7680
 7681        # Variables
 7682        table_variants = self.get_table_variants(clause="update")
 7683
 7684        # Get variants SNV and InDel only
 7685        query_variants = f"""
 7686            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 7687            FROM {table_variants}
 7688            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 7689            """
 7690        df_variants = self.get_query_to_df(query_variants)
 7691
 7692        # Added columns
 7693        added_columns = []
 7694
 7695        # Add hgvs column in variants table
 7696        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 7697        added_column = self.add_column(
 7698            table_variants, hgvs_column_name, "STRING", default_value=None
 7699        )
 7700        added_columns.append(added_column)
 7701
 7702        log.debug(f"refSeq loading...")
 7703        # refSeq in duckDB
 7704        refseq_table = get_refseq_table(
 7705            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 7706        )
 7707        # Loading all refSeq in Dataframe
 7708        refseq_query = f"""
 7709            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 7710            FROM {refseq_table}
 7711            JOIN df_variants ON (
 7712                {refseq_table}.chrom = df_variants.CHROM
 7713                AND {refseq_table}.txStart<=df_variants.POS
 7714                AND {refseq_table}.txEnd>=df_variants.POS
 7715            )
 7716        """
 7717        refseq_df = self.conn.query(refseq_query).pl()
 7718
 7719        if refseqlink_file:
 7720            log.debug(f"refSeqLink loading...")
 7721            # refSeqLink in duckDB
 7722            refseqlink_table = get_refseq_table(
 7723                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 7724            )
 7725            # Loading all refSeqLink in Dataframe
 7726            protacc_column = "protAcc_with_ver"
 7727            mrnaacc_column = "mrnaAcc_with_ver"
 7728            refseqlink_query = f"""
 7729                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 7730                FROM {refseqlink_table} 
 7731                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 7732                WHERE protAcc_without_ver IS NOT NULL
 7733            """
 7734            # Polars Dataframe
 7735            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 7736
 7737        # Read RefSeq transcripts into a python dict/model.
 7738        log.debug(f"Transcripts loading...")
 7739        with tempfile.TemporaryDirectory() as tmpdir:
 7740            transcripts_query = f"""
 7741                COPY (
 7742                    SELECT {refseq_table}.*
 7743                    FROM {refseq_table}
 7744                    JOIN df_variants ON (
 7745                        {refseq_table}.chrom=df_variants.CHROM
 7746                        AND {refseq_table}.txStart<=df_variants.POS
 7747                        AND {refseq_table}.txEnd>=df_variants.POS
 7748                    )
 7749                )
 7750                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 7751            """
 7752            self.conn.query(transcripts_query)
 7753            with open(f"{tmpdir}/transcript.tsv") as infile:
 7754                transcripts = read_transcripts(infile)
 7755
 7756        # Polars connexion
 7757        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7758
 7759        log.debug("Genome loading...")
 7760        # Read genome sequence using pyfaidx.
 7761        genome = Fasta(genome_file)
 7762
 7763        log.debug("Start annotation HGVS...")
 7764
        # Create a Dask DataFrame from the pandas dataframe,
        # with one partition per thread
 7767        ddf = dd.from_pandas(df_variants, npartitions=threads)
 7768
 7769        # Use dask.dataframe.apply() to apply function on each partition
 7770        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 7771
 7772        # Convert Dask DataFrame to Pandas Dataframe
 7773        df = ddf.compute()
 7774
 7775        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 7776        with tempfile.TemporaryDirectory() as tmpdir:
 7777            df_parquet = os.path.join(tmpdir, "df.parquet")
 7778            df.to_parquet(df_parquet)
 7779
 7780            # Update hgvs column
 7781            update_variant_query = f"""
 7782                UPDATE {table_variants}
 7783                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 7784                FROM read_parquet('{df_parquet}') as df
 7785                WHERE variants."#CHROM" = df.CHROM
 7786                AND variants.POS = df.POS
 7787                AND variants.REF = df.REF
 7788                AND variants.ALT = df.ALT
 7789                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 7790                """
 7791            self.execute_query(update_variant_query)
 7792
 7793        # Update INFO column
 7794        sql_query_update = f"""
 7795            UPDATE {table_variants}
 7796            SET INFO = 
 7797                concat(
 7798                    CASE 
 7799                        WHEN INFO NOT IN ('','.')
 7800                        THEN concat(INFO, ';')
 7801                        ELSE ''
 7802                    END,
 7803                    'hgvs=',
 7804                    {hgvs_column_name}
 7805                )
 7806            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 7807            """
 7808        self.execute_query(sql_query_update)
 7809
 7810        # Add header
 7811        HGVS_INFOS = {
 7812            "hgvs": {
 7813                "ID": "hgvs",
 7814                "Number": ".",
 7815                "Type": "String",
 7816                "Description": f"HGVS annotatation with HOWARD",
 7817            }
 7818        }
 7819
 7820        for field in HGVS_INFOS:
 7821            field_ID = HGVS_INFOS[field]["ID"]
 7822            field_description = HGVS_INFOS[field]["Description"]
 7823            self.get_header().infos[field_ID] = vcf.parser._Info(
 7824                field_ID,
 7825                HGVS_INFOS[field]["Number"],
 7826                HGVS_INFOS[field]["Type"],
 7827                field_description,
 7828                "unknown",
 7829                "unknown",
 7830                code_type_map[HGVS_INFOS[field]["Type"]],
 7831            )
 7832
 7833        # Remove added columns
 7834        for added_column in added_columns:
 7835            self.drop_column(column=added_column)
 7836
 7837    ###
 7838    # Calculation
 7839    ###
 7840
 7841    def get_operations_help(
 7842        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7843    ) -> list:
 7844
 7845        # Init
 7846        operations_help = []
 7847
 7848        # operations
 7849        operations = self.get_config_json(
 7850            name="calculations",
 7851            config_dict=operations_config_dict,
 7852            config_file=operations_config_file,
 7853        )
 7854        for op in operations:
 7855            op_name = operations[op].get("name", op).upper()
 7856            op_description = operations[op].get("description", op_name)
 7857            op_available = operations[op].get("available", False)
 7858            if op_available:
 7859                operations_help.append(f"   {op_name}: {op_description}")
 7860
 7861        # Sort operations
 7862        operations_help.sort()
 7863
 7864        # insert header
 7865        operations_help.insert(0, "Available calculation operations:")
 7866
 7867        # Return
 7868        return operations_help
 7869
 7870    def calculation(
 7871        self,
 7872        operations: dict = {},
 7873        operations_config_dict: dict = {},
 7874        operations_config_file: str = None,
 7875    ) -> None:
 7876        """
 7877        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7878        operation, and then calls the appropriate function
 7879
 7880        param json example:
 7881            "calculation": {
 7882                "NOMEN": {
 7883                    "options": {
 7884                        "hgvs_field": "hgvs"
 7885                    },
 7886                "middle" : null
 7887            }
 7888        """
 7889
 7890        # Param
 7891        param = self.get_param()
 7892
 7893        # operations config
 7894        operations_config = self.get_config_json(
 7895            name="calculations",
 7896            config_dict=operations_config_dict,
 7897            config_file=operations_config_file,
 7898        )
 7899
 7900        # Upper keys
 7901        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7902
 7903        # Calculations
 7904
 7905        # Operations from param
 7906        operations = param.get("calculation", {}).get("calculations", operations)
 7907
 7908        # Quick calculation - add
 7909        if param.get("calculations", None):
 7910            calculations_list = [
 7911                value for value in param.get("calculations", "").split(",")
 7912            ]
 7913            log.info(f"Quick Calculations:")
 7914            for calculation_key in calculations_list:
 7915                log.info(f"   {calculation_key}")
 7916            for calculation_operation in calculations_list:
 7917                if calculation_operation.upper() not in operations:
 7918                    operations[calculation_operation.upper()] = {}
 7919                    add_value_into_dict(
 7920                        dict_tree=param,
 7921                        sections=[
 7922                            "calculation",
 7923                            "calculations",
 7924                            calculation_operation.upper(),
 7925                        ],
 7926                        value={},
 7927                    )
 7928
 7929        # Operations for calculation
 7930        if not operations:
 7931            operations = param.get("calculation", {}).get("calculations", {})
 7932
 7933        if operations:
 7934            log.info(f"Calculations...")
 7935
 7936        # For each operations
 7937        for operation_name in operations:
 7938            operation_name = operation_name.upper()
 7939            if operation_name not in [""]:
 7940                if operation_name in operations_config:
 7941                    log.info(f"Calculation '{operation_name}'")
 7942                    operation = operations_config[operation_name]
 7943                    operation_type = operation.get("type", "sql")
 7944                    if operation_type == "python":
 7945                        self.calculation_process_function(
 7946                            operation=operation, operation_name=operation_name
 7947                        )
 7948                    elif operation_type == "sql":
 7949                        self.calculation_process_sql(
 7950                            operation=operation, operation_name=operation_name
 7951                        )
 7952                    else:
 7953                        log.error(
 7954                            f"Operations config: Type '{operation_type}' NOT available"
 7955                        )
 7956                        raise ValueError(
 7957                            f"Operations config: Type '{operation_type}' NOT available"
 7958                        )
 7959                else:
 7960                    log.error(
 7961                        f"Operations config: Calculation '{operation_name}' NOT available"
 7962                    )
 7963                    raise ValueError(
 7964                        f"Operations config: Calculation '{operation_name}' NOT available"
 7965                    )
 7966
 7967        # Explode INFOS fields into table fields
 7968        if self.get_explode_infos():
 7969            self.explode_infos(
 7970                prefix=self.get_explode_infos_prefix(),
 7971                fields=self.get_explode_infos_fields(),
 7972                force=True,
 7973            )
 7974
 7975    def calculation_process_sql(
 7976        self, operation: dict, operation_name: str = "unknown"
 7977    ) -> None:
 7978        """
 7979        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7980        performs the operation, updating the specified table with the result.
 7981
 7982        :param operation: The `operation` parameter is a dictionary that contains information about the
 7983        mathematical operation to be performed. It includes the following keys:
 7984        :type operation: dict
 7985        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7986        the mathematical operation being performed. It is used for logging and error handling purposes,
 7987        defaults to unknown
 7988        :type operation_name: str (optional)
 7989        """
 7990
 7991        # table variants
 7992        table_variants = self.get_table_variants(clause="alter")
 7993
 7994        # Operation infos
 7995        operation_name = operation.get("name", "unknown")
 7996        log.debug(f"process sql {operation_name}")
 7997        output_column_name = operation.get("output_column_name", operation_name)
 7998        output_column_type = operation.get("output_column_type", "String")
 7999        prefix = operation.get("explode_infos_prefix", "")
 8000        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 8001        output_column_description = operation.get(
 8002            "output_column_description", f"{operation_name} operation"
 8003        )
 8004        operation_query = operation.get("operation_query", None)
 8005        if isinstance(operation_query, list):
 8006            operation_query = " ".join(operation_query)
 8007        operation_info_fields = operation.get("info_fields", [])
 8008        operation_info_fields_check = operation.get("info_fields_check", False)
 8009        operation_info = operation.get("operation_info", True)
 8010
 8011        if operation_query:
 8012
 8013            # Info fields check
 8014            operation_info_fields_check_result = True
 8015            if operation_info_fields_check:
 8016                header_infos = self.get_header().infos
 8017                for info_field in operation_info_fields:
 8018                    operation_info_fields_check_result = (
 8019                        operation_info_fields_check_result
 8020                        and info_field in header_infos
 8021                    )
 8022
 8023            # If info fields available
 8024            if operation_info_fields_check_result:
 8025
 8026                # Added_columns
 8027                added_columns = []
 8028
 8029                # Create VCF header field
 8030                vcf_reader = self.get_header()
 8031                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8032                    output_column_name,
 8033                    ".",
 8034                    output_column_type,
 8035                    output_column_description,
 8036                    "howard calculation",
 8037                    "0",
 8038                    self.code_type_map.get(output_column_type),
 8039                )
 8040
 8041                # Explode infos if needed
 8042                log.debug(f"calculation_process_sql prefix {prefix}")
 8043                added_columns += self.explode_infos(
 8044                    prefix=prefix,
 8045                    fields=[output_column_name] + operation_info_fields,
 8046                    force=True,
 8047                )
 8048
 8049                # Create column
 8050                added_column = self.add_column(
 8051                    table_name=table_variants,
 8052                    column_name=prefix + output_column_name,
 8053                    column_type=output_column_type_sql,
 8054                    default_value="null",
 8055                )
 8056                added_columns.append(added_column)
 8057
 8058                # Operation calculation
 8059                try:
 8060
 8061                    # Query to update calculation column
 8062                    sql_update = f"""
 8063                        UPDATE {table_variants}
 8064                        SET "{prefix}{output_column_name}" = ({operation_query})
 8065                    """
 8066                    self.conn.execute(sql_update)
 8067
 8068                    # Add to INFO
 8069                    if operation_info:
 8070                        sql_update_info = f"""
 8071                            UPDATE {table_variants}
 8072                            SET "INFO" =
 8073                                concat(
 8074                                    CASE
 8075                                        WHEN "INFO" IS NOT NULL
 8076                                        THEN concat("INFO", ';')
 8077                                        ELSE ''
 8078                                    END,
 8079                                    '{output_column_name}=',
 8080                                    "{prefix}{output_column_name}"
 8081                                )
 8082                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8083                        """
 8084                        self.conn.execute(sql_update_info)
 8085
 8086                except:
 8087                    log.error(
 8088                        f"Operations config: Calculation '{operation_name}' query failed"
 8089                    )
 8090                    raise ValueError(
 8091                        f"Operations config: Calculation '{operation_name}' query failed"
 8092                    )
 8093
 8094                # Remove added columns
 8095                for added_column in added_columns:
 8096                    log.debug(f"added_column: {added_column}")
 8097                    self.drop_column(column=added_column)
 8098
 8099            else:
 8100                log.error(
 8101                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8102                )
 8103                raise ValueError(
 8104                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8105                )
 8106
 8107        else:
 8108            log.error(
 8109                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8110            )
 8111            raise ValueError(
 8112                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8113            )
 8114
 8115    def calculation_process_function(
 8116        self, operation: dict, operation_name: str = "unknown"
 8117    ) -> None:
 8118        """
 8119        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8120        function with the given parameters.
 8121
 8122        :param operation: The `operation` parameter is a dictionary that contains information about the
 8123        operation to be performed. It has the following keys:
 8124        :type operation: dict
 8125        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8126        the operation being performed. It is used for logging purposes, defaults to unknown
 8127        :type operation_name: str (optional)
 8128        """
 8129
 8130        operation_name = operation["name"]
 8131        log.debug(f"process sql {operation_name}")
 8132        function_name = operation["function_name"]
 8133        function_params = operation["function_params"]
 8134        getattr(self, function_name)(*function_params)
 8135
 8136    def calculation_variant_id(self) -> None:
 8137        """
 8138        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8139        updates the INFO field of a variants table with the variant ID.
 8140        """
 8141
 8142        # variant_id annotation field
 8143        variant_id_tag = self.get_variant_id_column()
 8144        added_columns = [variant_id_tag]
 8145
 8146        # variant_id hgvs tags"
 8147        vcf_infos_tags = {
 8148            variant_id_tag: "howard variant ID annotation",
 8149        }
 8150
 8151        # Variants table
 8152        table_variants = self.get_table_variants()
 8153
 8154        # Header
 8155        vcf_reader = self.get_header()
 8156
 8157        # Add variant_id to header
 8158        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8159            variant_id_tag,
 8160            ".",
 8161            "String",
 8162            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8163            "howard calculation",
 8164            "0",
 8165            self.code_type_map.get("String"),
 8166        )
 8167
 8168        # Update
 8169        sql_update = f"""
 8170            UPDATE {table_variants}
 8171            SET "INFO" = 
 8172                concat(
 8173                    CASE
 8174                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8175                        THEN ''
 8176                        ELSE concat("INFO", ';')
 8177                    END,
 8178                    '{variant_id_tag}=',
 8179                    "{variant_id_tag}"
 8180                )
 8181        """
 8182        self.conn.execute(sql_update)
 8183
 8184        # Remove added columns
 8185        for added_column in added_columns:
 8186            self.drop_column(column=added_column)
 8187
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff annotation header description cannot be parsed
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" —
        # confirm this normalization is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (column names once INFO fields are exploded)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Helper columns added during this calculation; dropped again at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: the field description embeds the sub-field
            # names between single quotes, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Key is the alphanumeric-only version of the sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (used to join the dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: the query references the local `dataframe_snpeff_hgvs` by
            # its Python variable name (DuckDB resolves in-scope dataframes), so
            # that local must keep this exact name
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8324
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
        it indicates that the output should be unique, meaning that duplicate entries should be removed,
        defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
        function specifies the format in which the output annotations will be generated. It has a
        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
        format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
        method is used to specify the prefix that will be added to the output annotations generated
        during the calculation process. This prefix helps to differentiate the newly added annotations
        from existing ones in the output data, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
        function is used to specify the field in the VCF file that contains SnpEff annotations. This
        field will be processed to explode the HGVS annotations and update the variant information
        accordingly, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff annotation header description cannot be parsed
        """

        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" —
        # confirm this normalization is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (column names once INFO fields are exploded)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Helper columns added during this calculation; dropped again at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: the field description embeds the sub-field
            # names between single quotes, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Key is the alphanumeric-only version of the sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (used to join the dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one INFO entry for JSON output, or one per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update: the query references the local `dataframe_snpeff_hgvs` by
            # its Python variable name (DuckDB resolves in-scope dataframes), so
            # that local must keep this exact name
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8499
 8500    def calculation_extract_nomen(self) -> None:
 8501        """
 8502        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8503        """
 8504
 8505        # NOMEN field
 8506        field_nomen_dict = "NOMEN_DICT"
 8507
 8508        # NOMEN structure
 8509        nomen_dict = {
 8510            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8511            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8512            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8513            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8514            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8515            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8516            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8517            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8518            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8519            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8520        }
 8521
 8522        # Param
 8523        param = self.get_param()
 8524
 8525        # Prefix
 8526        prefix = self.get_explode_infos_prefix()
 8527
 8528        # Header
 8529        vcf_reader = self.get_header()
 8530
 8531        # Get HGVS field
 8532        hgvs_field = (
 8533            param.get("calculation", {})
 8534            .get("calculations", {})
 8535            .get("NOMEN", {})
 8536            .get("options", {})
 8537            .get("hgvs_field", "hgvs")
 8538        )
 8539
 8540        # Get transcripts
 8541        transcripts_file = (
 8542            param.get("calculation", {})
 8543            .get("calculations", {})
 8544            .get("NOMEN", {})
 8545            .get("options", {})
 8546            .get("transcripts", None)
 8547        )
 8548        transcripts_file = full_path(transcripts_file)
 8549        transcripts = []
 8550        if transcripts_file:
 8551            if os.path.exists(transcripts_file):
 8552                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8553                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
 8554            else:
 8555                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
 8556                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
 8557
 8558        # Added columns
 8559        added_columns = []
 8560
 8561        # Explode HGVS field in column
 8562        added_columns += self.explode_infos(fields=[hgvs_field])
 8563
 8564        # extra infos
 8565        extra_infos = self.get_extra_infos()
 8566        extra_field = prefix + hgvs_field
 8567
 8568        if extra_field in extra_infos:
 8569
 8570            # Create dataframe
 8571            dataframe_hgvs = self.get_query_to_df(
 8572                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
 8573            )
 8574
 8575            # Create main NOMEN column
 8576            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
 8577                lambda x: find_nomen(str(x), transcripts=transcripts)
 8578            )
 8579
 8580            # Explode NOMEN Structure and create SQL set for update
 8581            sql_nomen_fields = []
 8582            for nomen_field in nomen_dict:
 8583
 8584                # Explode each field into a column
 8585                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
 8586                    lambda x: dict(x).get(nomen_field, "")
 8587                )
 8588
 8589                # Create VCF header field
 8590                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 8591                    nomen_field,
 8592                    ".",
 8593                    "String",
 8594                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 8595                    "howard calculation",
 8596                    "0",
 8597                    self.code_type_map.get("String"),
 8598                )
 8599                sql_nomen_fields.append(
 8600                    f"""
 8601                        CASE 
 8602                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
 8603                            THEN concat(
 8604                                    ';{nomen_field}=',
 8605                                    dataframe_hgvs."{nomen_field}"
 8606                                )
 8607                            ELSE ''
 8608                        END
 8609                    """
 8610                )
 8611
 8612            # SQL set for update
 8613            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 8614
 8615            # Update
 8616            sql_update = f"""
 8617                UPDATE variants
 8618                SET "INFO" = 
 8619                    concat(
 8620                        CASE
 8621                            WHEN "INFO" IS NULL
 8622                            THEN ''
 8623                            ELSE "INFO"
 8624                        END,
 8625                        {sql_nomen_fields_set}
 8626                    )
 8627                FROM dataframe_hgvs
 8628                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 8629                    AND variants."POS" = dataframe_hgvs."POS" 
 8630                    AND variants."REF" = dataframe_hgvs."REF"
 8631                    AND variants."ALT" = dataframe_hgvs."ALT"
 8632            """
 8633            self.conn.execute(sql_update)
 8634
 8635            # Delete dataframe
 8636            del dataframe_hgvs
 8637            gc.collect()
 8638
 8639        # Remove added columns
 8640        for added_column in added_columns:
 8641            self.drop_column(column=added_column)
 8642
 8643    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 8644        """
 8645        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 8646        pipeline/sample for a variant and updates the variant information in a VCF file.
 8647
 8648        :param tag: The `tag` parameter is a string that represents the annotation field for the
 8649        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 8650        VCF header and to update the corresponding field in the variants table, defaults to
 8651        findbypipeline
 8652        :type tag: str (optional)
 8653        """
 8654
 8655        # if FORMAT and samples
 8656        if (
 8657            "FORMAT" in self.get_header_columns_as_list()
 8658            and self.get_header_sample_list()
 8659        ):
 8660
 8661            # findbypipeline annotation field
 8662            findbypipeline_tag = tag
 8663
 8664            # VCF infos tags
 8665            vcf_infos_tags = {
 8666                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 8667            }
 8668
 8669            # Prefix
 8670            prefix = self.get_explode_infos_prefix()
 8671
 8672            # Field
 8673            findbypipeline_infos = prefix + findbypipeline_tag
 8674
 8675            # Variants table
 8676            table_variants = self.get_table_variants()
 8677
 8678            # Header
 8679            vcf_reader = self.get_header()
 8680
 8681            # Create variant id
 8682            variant_id_column = self.get_variant_id_column()
 8683            added_columns = [variant_id_column]
 8684
 8685            # variant_id, FORMAT and samples
 8686            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8687                self.get_header_sample_list()
 8688            )
 8689
 8690            # Create dataframe
 8691            dataframe_findbypipeline = self.get_query_to_df(
 8692                f""" SELECT {samples_fields} FROM {table_variants} """
 8693            )
 8694
 8695            # Create findbypipeline column
 8696            dataframe_findbypipeline[findbypipeline_infos] = (
 8697                dataframe_findbypipeline.apply(
 8698                    lambda row: findbypipeline(
 8699                        row, samples=self.get_header_sample_list()
 8700                    ),
 8701                    axis=1,
 8702                )
 8703            )
 8704
 8705            # Add snpeff_hgvs to header
 8706            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 8707                findbypipeline_tag,
 8708                ".",
 8709                "String",
 8710                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 8711                "howard calculation",
 8712                "0",
 8713                self.code_type_map.get("String"),
 8714            )
 8715
 8716            # Update
 8717            sql_update = f"""
 8718                UPDATE variants
 8719                SET "INFO" = 
 8720                    concat(
 8721                        CASE
 8722                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8723                            THEN ''
 8724                            ELSE concat("INFO", ';')
 8725                        END,
 8726                        CASE 
 8727                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 8728                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 8729                            THEN concat(
 8730                                    '{findbypipeline_tag}=',
 8731                                    dataframe_findbypipeline."{findbypipeline_infos}"
 8732                                )
 8733                            ELSE ''
 8734                        END
 8735                    )
 8736                FROM dataframe_findbypipeline
 8737                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 8738            """
 8739            self.conn.execute(sql_update)
 8740
 8741            # Remove added columns
 8742            for added_column in added_columns:
 8743                self.drop_column(column=added_column)
 8744
 8745            # Delete dataframe
 8746            del dataframe_findbypipeline
 8747            gc.collect()
 8748
 8749    def calculation_genotype_concordance(self) -> None:
 8750        """
 8751        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8752        multi-caller VCF files and updates the variant information in the database.
 8753        """
 8754
 8755        # if FORMAT and samples
 8756        if (
 8757            "FORMAT" in self.get_header_columns_as_list()
 8758            and self.get_header_sample_list()
 8759        ):
 8760
 8761            # genotypeconcordance annotation field
 8762            genotypeconcordance_tag = "genotypeconcordance"
 8763
 8764            # VCF infos tags
 8765            vcf_infos_tags = {
 8766                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8767            }
 8768
 8769            # Prefix
 8770            prefix = self.get_explode_infos_prefix()
 8771
 8772            # Field
 8773            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8774
 8775            # Variants table
 8776            table_variants = self.get_table_variants()
 8777
 8778            # Header
 8779            vcf_reader = self.get_header()
 8780
 8781            # Create variant id
 8782            variant_id_column = self.get_variant_id_column()
 8783            added_columns = [variant_id_column]
 8784
 8785            # variant_id, FORMAT and samples
 8786            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8787                self.get_header_sample_list()
 8788            )
 8789
 8790            # Create dataframe
 8791            dataframe_genotypeconcordance = self.get_query_to_df(
 8792                f""" SELECT {samples_fields} FROM {table_variants} """
 8793            )
 8794
 8795            # Create genotypeconcordance column
 8796            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8797                dataframe_genotypeconcordance.apply(
 8798                    lambda row: genotypeconcordance(
 8799                        row, samples=self.get_header_sample_list()
 8800                    ),
 8801                    axis=1,
 8802                )
 8803            )
 8804
 8805            # Add genotypeconcordance to header
 8806            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8807                genotypeconcordance_tag,
 8808                ".",
 8809                "String",
 8810                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8811                "howard calculation",
 8812                "0",
 8813                self.code_type_map.get("String"),
 8814            )
 8815
 8816            # Update
 8817            sql_update = f"""
 8818                UPDATE variants
 8819                SET "INFO" = 
 8820                    concat(
 8821                        CASE
 8822                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8823                            THEN ''
 8824                            ELSE concat("INFO", ';')
 8825                        END,
 8826                        CASE
 8827                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8828                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8829                            THEN concat(
 8830                                    '{genotypeconcordance_tag}=',
 8831                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8832                                )
 8833                            ELSE ''
 8834                        END
 8835                    )
 8836                FROM dataframe_genotypeconcordance
 8837                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8838            """
 8839            self.conn.execute(sql_update)
 8840
 8841            # Remove added columns
 8842            for added_column in added_columns:
 8843                self.drop_column(column=added_column)
 8844
 8845            # Delete dataframe
 8846            del dataframe_genotypeconcordance
 8847            gc.collect()
 8848
 8849    def calculation_barcode(self, tag: str = "barcode") -> None:
 8850        """
 8851        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8852        updates the INFO field in the file with the calculated barcode values.
 8853
 8854        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8855        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8856        the default tag name is set to "barcode", defaults to barcode
 8857        :type tag: str (optional)
 8858        """
 8859
 8860        # if FORMAT and samples
 8861        if (
 8862            "FORMAT" in self.get_header_columns_as_list()
 8863            and self.get_header_sample_list()
 8864        ):
 8865
 8866            # barcode annotation field
 8867            if not tag:
 8868                tag = "barcode"
 8869
 8870            # VCF infos tags
 8871            vcf_infos_tags = {
 8872                tag: "barcode calculation (VaRank)",
 8873            }
 8874
 8875            # Prefix
 8876            prefix = self.get_explode_infos_prefix()
 8877
 8878            # Field
 8879            barcode_infos = prefix + tag
 8880
 8881            # Variants table
 8882            table_variants = self.get_table_variants()
 8883
 8884            # Header
 8885            vcf_reader = self.get_header()
 8886
 8887            # Create variant id
 8888            variant_id_column = self.get_variant_id_column()
 8889            added_columns = [variant_id_column]
 8890
 8891            # variant_id, FORMAT and samples
 8892            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8893                self.get_header_sample_list()
 8894            )
 8895
 8896            # Create dataframe
 8897            dataframe_barcode = self.get_query_to_df(
 8898                f""" SELECT {samples_fields} FROM {table_variants} """
 8899            )
 8900
 8901            # Create barcode column
 8902            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8903                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8904            )
 8905
 8906            # Add barcode to header
 8907            vcf_reader.infos[tag] = vcf.parser._Info(
 8908                tag,
 8909                ".",
 8910                "String",
 8911                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8912                "howard calculation",
 8913                "0",
 8914                self.code_type_map.get("String"),
 8915            )
 8916
 8917            # Update
 8918            sql_update = f"""
 8919                UPDATE {table_variants}
 8920                SET "INFO" = 
 8921                    concat(
 8922                        CASE
 8923                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8924                            THEN ''
 8925                            ELSE concat("INFO", ';')
 8926                        END,
 8927                        CASE
 8928                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8929                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8930                            THEN concat(
 8931                                    '{tag}=',
 8932                                    dataframe_barcode."{barcode_infos}"
 8933                                )
 8934                            ELSE ''
 8935                        END
 8936                    )
 8937                FROM dataframe_barcode
 8938                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8939            """
 8940            self.conn.execute(sql_update)
 8941
 8942            # Remove added columns
 8943            for added_column in added_columns:
 8944                self.drop_column(column=added_column)
 8945
 8946            # Delete dataframe
 8947            del dataframe_barcode
 8948            gc.collect()
 8949
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a barcode over the
        samples of a family (pedigree) and appends it, together with the list of
        family samples, as two new FORMAT fields (`<tag>` and `<tag>S`) to every
        sample column of the variants table.

        The pedigree is read from
        param['calculation']['calculations']['BARCODEFAMILY']['family_pedigree']
        and may be a JSON file path, a JSON string, a comma-separated list of
        sample names, or a dict (member name -> sample name); when absent, all
        samples of the VCF are used.

        :param tag: The `tag` parameter is used to specify the FORMAT tag name
        that will be added to the VCF file during the calculation process. If no
        value is provided for the `tag` parameter, the default value used is
        "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or empty
        """

        # Only applicable with a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default when empty)
            if not tag:
                tag = "BCF"

            # VCF infos tags: header descriptions for the two FORMAT fields
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree: file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (NOTE: 'ped' is deliberately rebound to the
                # file handle, then to the parsed JSON dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated list of sample names (identity mapping)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: used as-is (member name -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of family sample names
                ped_samples = list(ped.values())

            # No pedigree configured: use every sample of the VCF
            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved family members
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field: dataframe column name holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe (queried by variable name below through
            # DuckDB's pandas replacement scan)
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (one barcode per variant over the family samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two barcode family FORMAT fields in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<barcode>:<family samples>' to each family sample
            # column, ':<tag>:<tag>S' to FORMAT, and ':.:.' to non-family samples
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For a missing genotype './.', rebuild a placeholder matching
                # FORMAT arity: strip alphanumerics/whitespace from FORMAT
                # (leaving the ':' separators), then expand each ':' to ':.' —
                # presumably yields './.:.:...' with one '.' per FORMAT key
                # (NOTE(review): assumes FORMAT keys contain only [a-zA-Z0-9]
                # characters — TODO confirm)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # One UPDATE covering all sample columns plus FORMAT
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
 9139
 9140    def calculation_trio(self) -> None:
 9141        """
 9142        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9143        information to the INFO field of each variant.
 9144        """
 9145
 9146        # if FORMAT and samples
 9147        if (
 9148            "FORMAT" in self.get_header_columns_as_list()
 9149            and self.get_header_sample_list()
 9150        ):
 9151
 9152            # trio annotation field
 9153            trio_tag = "trio"
 9154
 9155            # VCF infos tags
 9156            vcf_infos_tags = {
 9157                "trio": "trio calculation",
 9158            }
 9159
 9160            # Param
 9161            param = self.get_param()
 9162
 9163            # Prefix
 9164            prefix = self.get_explode_infos_prefix()
 9165
 9166            # Trio param
 9167            trio_ped = (
 9168                param.get("calculation", {})
 9169                .get("calculations", {})
 9170                .get("TRIO", {})
 9171                .get("trio_pedigree", None)
 9172            )
 9173
 9174            # Load trio
 9175            if trio_ped:
 9176
 9177                # Trio pedigree is a file
 9178                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9179                    log.debug("TRIO pedigree is file")
 9180                    with open(full_path(trio_ped)) as trio_ped:
 9181                        trio_ped = json.load(trio_ped)
 9182
 9183                # Trio pedigree is a string
 9184                elif isinstance(trio_ped, str):
 9185                    log.debug("TRIO pedigree is str")
 9186                    try:
 9187                        trio_ped = json.loads(trio_ped)
 9188                        log.debug("TRIO pedigree is json str")
 9189                    except ValueError as e:
 9190                        trio_samples = trio_ped.split(",")
 9191                        if len(trio_samples) == 3:
 9192                            trio_ped = {
 9193                                "father": trio_samples[0],
 9194                                "mother": trio_samples[1],
 9195                                "child": trio_samples[2],
 9196                            }
 9197                            log.debug("TRIO pedigree is list str")
 9198                        else:
 9199                            msg_error = "TRIO pedigree not well formatted"
 9200                            log.error(msg_error)
 9201                            raise ValueError(msg_error)
 9202
 9203                # Trio pedigree is a dict
 9204                elif isinstance(trio_ped, dict):
 9205                    log.debug("TRIO pedigree is dict")
 9206
 9207                # Trio pedigree is not well formatted
 9208                else:
 9209                    msg_error = "TRIO pedigree not well formatted"
 9210                    log.error(msg_error)
 9211                    raise ValueError(msg_error)
 9212
 9213                # Construct trio list
 9214                trio_samples = [
 9215                    trio_ped.get("father", ""),
 9216                    trio_ped.get("mother", ""),
 9217                    trio_ped.get("child", ""),
 9218                ]
 9219
 9220            else:
 9221                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9222                samples_list = self.get_header_sample_list()
 9223                if len(samples_list) >= 3:
 9224                    trio_samples = self.get_header_sample_list()[0:3]
 9225                    trio_ped = {
 9226                        "father": trio_samples[0],
 9227                        "mother": trio_samples[1],
 9228                        "child": trio_samples[2],
 9229                    }
 9230                else:
 9231                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9232                    log.error(msg_error)
 9233                    raise ValueError(msg_error)
 9234
 9235            # Check trio pedigree
 9236            if not trio_ped or len(trio_ped) != 3:
 9237                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9238                log.error(msg_error)
 9239                raise ValueError(msg_error)
 9240
 9241            # Log
 9242            log.info(
 9243                f"Calculation 'TRIO' - Samples: "
 9244                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9245            )
 9246
 9247            # Field
 9248            trio_infos = prefix + trio_tag
 9249
 9250            # Variants table
 9251            table_variants = self.get_table_variants()
 9252
 9253            # Header
 9254            vcf_reader = self.get_header()
 9255
 9256            # Create variant id
 9257            variant_id_column = self.get_variant_id_column()
 9258            added_columns = [variant_id_column]
 9259
 9260            # variant_id, FORMAT and samples
 9261            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9262                self.get_header_sample_list()
 9263            )
 9264
 9265            # Create dataframe
 9266            dataframe_trio = self.get_query_to_df(
 9267                f""" SELECT {samples_fields} FROM {table_variants} """
 9268            )
 9269
 9270            # Create trio column
 9271            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9272                lambda row: trio(row, samples=trio_samples), axis=1
 9273            )
 9274
 9275            # Add trio to header
 9276            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9277                trio_tag,
 9278                ".",
 9279                "String",
 9280                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9281                "howard calculation",
 9282                "0",
 9283                self.code_type_map.get("String"),
 9284            )
 9285
 9286            # Update
 9287            sql_update = f"""
 9288                UPDATE {table_variants}
 9289                SET "INFO" = 
 9290                    concat(
 9291                        CASE
 9292                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9293                            THEN ''
 9294                            ELSE concat("INFO", ';')
 9295                        END,
 9296                        CASE
 9297                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9298                             AND dataframe_trio."{trio_infos}" NOT NULL
 9299                            THEN concat(
 9300                                    '{trio_tag}=',
 9301                                    dataframe_trio."{trio_infos}"
 9302                                )
 9303                            ELSE ''
 9304                        END
 9305                    )
 9306                FROM dataframe_trio
 9307                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9308            """
 9309            self.conn.execute(sql_update)
 9310
 9311            # Remove added columns
 9312            for added_column in added_columns:
 9313                self.drop_column(column=added_column)
 9314
 9315            # Delete dataframe
 9316            del dataframe_trio
 9317            gc.collect()
 9318
 9319    def calculation_vaf_normalization(self) -> None:
 9320        """
 9321        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9322        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9323        :return: The function does not return anything.
 9324        """
 9325
 9326        # if FORMAT and samples
 9327        if (
 9328            "FORMAT" in self.get_header_columns_as_list()
 9329            and self.get_header_sample_list()
 9330        ):
 9331
 9332            # vaf_normalization annotation field
 9333            vaf_normalization_tag = "VAF"
 9334
 9335            # VCF infos tags
 9336            vcf_infos_tags = {
 9337                "VAF": "VAF Variant Frequency",
 9338            }
 9339
 9340            # Prefix
 9341            prefix = self.get_explode_infos_prefix()
 9342
 9343            # Variants table
 9344            table_variants = self.get_table_variants()
 9345
 9346            # Header
 9347            vcf_reader = self.get_header()
 9348
 9349            # Do not calculate if VAF already exists
 9350            if "VAF" in vcf_reader.formats:
 9351                log.debug("VAF already on genotypes")
 9352                return
 9353
 9354            # Create variant id
 9355            variant_id_column = self.get_variant_id_column()
 9356            added_columns = [variant_id_column]
 9357
 9358            # variant_id, FORMAT and samples
 9359            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9360                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9361            )
 9362
 9363            # Create dataframe
 9364            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9365            log.debug(f"query={query}")
 9366            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9367
 9368            vaf_normalization_set = []
 9369
 9370            # for each sample vaf_normalization
 9371            for sample in self.get_header_sample_list():
 9372                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9373                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9374                )
 9375                vaf_normalization_set.append(
 9376                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9377                )
 9378
 9379            # Add VAF to FORMAT
 9380            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9381                "FORMAT"
 9382            ].apply(lambda x: str(x) + ":VAF")
 9383            vaf_normalization_set.append(
 9384                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9385            )
 9386
 9387            # Add vaf_normalization to header
 9388            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9389                id=vaf_normalization_tag,
 9390                num="1",
 9391                type="Float",
 9392                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9393                type_code=self.code_type_map.get("Float"),
 9394            )
 9395
 9396            # Create fields to add in INFO
 9397            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9398
 9399            # Update
 9400            sql_update = f"""
 9401                UPDATE {table_variants}
 9402                SET {sql_vaf_normalization_set}
 9403                FROM dataframe_vaf_normalization
 9404                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9405
 9406            """
 9407            self.conn.execute(sql_update)
 9408
 9409            # Remove added columns
 9410            for added_column in added_columns:
 9411                self.drop_column(column=added_column)
 9412
 9413            # Delete dataframe
 9414            del dataframe_vaf_normalization
 9415            gc.collect()
 9416
 9417    def calculation_genotype_stats(self, info: str = "VAF") -> None:
 9418        """
 9419        The `calculation_genotype_stats` function calculates genotype statistics for a given information
 9420        field in a VCF file and updates the INFO column of the variants table with the calculated
 9421        statistics.
 9422
 9423        :param info: The `info` parameter is a string that represents the type of information for which
 9424        genotype statistics are calculated. It is used to generate various VCF info tags for the
 9425        statistics, such as the number of occurrences, the list of values, the minimum value, the
 9426        maximum value, the mean, the median, defaults to VAF
 9427        :type info: str (optional)
 9428        """
 9429
 9430        # if FORMAT and samples
 9431        if (
 9432            "FORMAT" in self.get_header_columns_as_list()
 9433            and self.get_header_sample_list()
 9434        ):
 9435
 9436            # vaf_stats annotation field
 9437            vaf_stats_tag = info + "_stats"
 9438
 9439            # VCF infos tags
 9440            vcf_infos_tags = {
 9441                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
 9442                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
 9443                info + "_stats_min": f"genotype {info} Statistics - min {info}",
 9444                info + "_stats_max": f"genotype {info} Statistics - max {info}",
 9445                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
 9446                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
 9447                info
 9448                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
 9449            }
 9450
 9451            # Prefix
 9452            prefix = self.get_explode_infos_prefix()
 9453
 9454            # Field
 9455            vaf_stats_infos = prefix + vaf_stats_tag
 9456
 9457            # Variants table
 9458            table_variants = self.get_table_variants()
 9459
 9460            # Header
 9461            vcf_reader = self.get_header()
 9462
 9463            # Create variant id
 9464            variant_id_column = self.get_variant_id_column()
 9465            added_columns = [variant_id_column]
 9466
 9467            # variant_id, FORMAT and samples
 9468            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9469                self.get_header_sample_list()
 9470            )
 9471
 9472            # Create dataframe
 9473            dataframe_vaf_stats = self.get_query_to_df(
 9474                f""" SELECT {samples_fields} FROM {table_variants} """
 9475            )
 9476
 9477            # Create vaf_stats column
 9478            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
 9479                lambda row: genotype_stats(
 9480                    row, samples=self.get_header_sample_list(), info=info
 9481                ),
 9482                axis=1,
 9483            )
 9484
 9485            # List of vcf tags
 9486            sql_vaf_stats_fields = []
 9487
 9488            # Check all VAF stats infos
 9489            for stat in vcf_infos_tags:
 9490
 9491                # Extract stats
 9492                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
 9493                    lambda x: dict(x).get(stat, "")
 9494                )
 9495
 9496                # Add snpeff_hgvs to header
 9497                vcf_reader.infos[stat] = vcf.parser._Info(
 9498                    stat,
 9499                    ".",
 9500                    "String",
 9501                    vcf_infos_tags.get(stat, "genotype statistics"),
 9502                    "howard calculation",
 9503                    "0",
 9504                    self.code_type_map.get("String"),
 9505                )
 9506
 9507                if len(sql_vaf_stats_fields):
 9508                    sep = ";"
 9509                else:
 9510                    sep = ""
 9511
 9512                # Create fields to add in INFO
 9513                sql_vaf_stats_fields.append(
 9514                    f"""
 9515                        CASE
 9516                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
 9517                            THEN concat(
 9518                                    '{sep}{stat}=',
 9519                                    dataframe_vaf_stats."{stat}"
 9520                                )
 9521                            ELSE ''
 9522                        END
 9523                    """
 9524                )
 9525
 9526            # SQL set for update
 9527            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
 9528
 9529            # Update
 9530            sql_update = f"""
 9531                UPDATE {table_variants}
 9532                SET "INFO" = 
 9533                    concat(
 9534                        CASE
 9535                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9536                            THEN ''
 9537                            ELSE concat("INFO", ';')
 9538                        END,
 9539                        {sql_vaf_stats_fields_set}
 9540                    )
 9541                FROM dataframe_vaf_stats
 9542                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
 9543
 9544            """
 9545            self.conn.execute(sql_update)
 9546
 9547            # Remove added columns
 9548            for added_column in added_columns:
 9549                self.drop_column(column=added_column)
 9550
 9551            # Delete dataframe
 9552            del dataframe_vaf_stats
 9553            gc.collect()
 9554
 9555    def calculation_transcripts_annotation(
 9556        self, info_json: str = None, info_format: str = None
 9557    ) -> None:
 9558        """
 9559        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9560        field to it if transcripts are available.
 9561
 9562        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9563        is a string parameter that represents the information field to be used in the transcripts JSON.
 9564        It is used to specify the JSON format for the transcripts information. If no value is provided
 9565        when calling the method, it defaults to "
 9566        :type info_json: str
 9567        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9568        method is a string parameter that specifies the format of the information field to be used in
 9569        the transcripts JSON. It is used to define the format of the information field
 9570        :type info_format: str
 9571        """
 9572
 9573        # Create transcripts table
 9574        transcripts_table = self.create_transcript_view()
 9575
 9576        # Add info field
 9577        if transcripts_table:
 9578            self.transcript_view_to_variants(
 9579                transcripts_table=transcripts_table,
 9580                transcripts_info_field_json=info_json,
 9581                transcripts_info_field_format=info_format,
 9582            )
 9583        else:
 9584            log.info("No Transcripts to process. Check param.json file configuration")
 9585
 9586    def calculation_transcripts_prioritization(self) -> None:
 9587        """
 9588        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9589        prioritizes transcripts based on certain criteria.
 9590        """
 9591
 9592        # Create transcripts table
 9593        transcripts_table = self.create_transcript_view()
 9594
 9595        # Add info field
 9596        if transcripts_table:
 9597            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9598        else:
 9599            log.info("No Transcripts to process. Check param.json file configuration")
 9600
 9601    ###############
 9602    # Transcripts #
 9603    ###############
 9604
 9605    def transcripts_prioritization(
 9606        self, transcripts_table: str = None, param: dict = {}
 9607    ) -> bool:
 9608        """
 9609        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9610        and updates the variants table with the prioritized information.
 9611
 9612        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9613        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9614        This parameter is used to identify the table where the transcripts data is stored for the
 9615        prioritization process
 9616        :type transcripts_table: str
 9617        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9618        that contains various configuration settings for the prioritization process of transcripts. It
 9619        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9620        the prefix for prioritization fields, default profiles, and other
 9621        :type param: dict
 9622        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9623        transcripts prioritization process is successfully completed, and `False` if there are any
 9624        issues or if no profile is defined for transcripts prioritization.
 9625        """
 9626
 9627        log.debug("Start transcripts prioritization...")
 9628
 9629        # Param
 9630        if not param:
 9631            param = self.get_param()
 9632
 9633        # Variants table
 9634        table_variants = self.get_table_variants()
 9635        log.debug(f"transcripts_table={transcripts_table}")
 9636        # Transcripts table
 9637        if transcripts_table is None:
 9638            log.debug(f"transcripts_table={transcripts_table}")
 9639            transcripts_table = self.create_transcript_view(
 9640                transcripts_table="transcripts", param=param
 9641            )
 9642            log.debug(f"transcripts_table={transcripts_table}")
 9643        if transcripts_table is None:
 9644            msg_err = "No Transcripts table availalble"
 9645            log.error(msg_err)
 9646            raise ValueError(msg_err)
 9647
 9648        # Get transcripts columns
 9649        columns_as_list_query = f"""
 9650            DESCRIBE {transcripts_table}
 9651        """
 9652        columns_as_list = list(
 9653            self.get_query_to_df(columns_as_list_query)["column_name"]
 9654        )
 9655
 9656        # Create INFO if not exists
 9657        if "INFO" not in columns_as_list:
 9658            query_add_info = f"""
 9659                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9660            """
 9661            self.execute_query(query_add_info)
 9662
 9663        # Prioritization param and Force only PZ Score and Flag
 9664        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9665        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9666        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9667        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9668        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9669        pz_profile_default = (
 9670            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9671        )
 9672
 9673        # Exit if no profile
 9674        if pz_profile_default is None:
 9675            log.warning("No profile defined for transcripts prioritization")
 9676            return False
 9677
 9678        # Prioritization
 9679        prioritization_result = self.prioritization(
 9680            table=transcripts_table,
 9681            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9682        )
 9683        if not prioritization_result:
 9684            log.warning("Transcripts prioritization not processed")
 9685            return False
 9686
 9687        # Explode PZ fields
 9688        self.explode_infos(
 9689            table=transcripts_table,
 9690            fields=param.get("transcripts", {})
 9691            .get("prioritization", {})
 9692            .get("pzfields", []),
 9693        )
 9694
 9695        # Export Transcripts prioritization infos to variants table
 9696        query_update = f"""
 9697            WITH RankedTranscripts AS (
 9698                SELECT
 9699                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9700                    ROW_NUMBER() OVER (
 9701                        PARTITION BY "#CHROM", POS, REF, ALT
 9702                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9703                    ) AS rn
 9704                FROM
 9705                    {transcripts_table}
 9706            )
 9707            UPDATE {table_variants}
 9708                SET
 9709                INFO = CONCAT(CASE
 9710                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9711                            THEN ''
 9712                            ELSE concat("INFO", ';')
 9713                        END,
 9714                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9715                        )
 9716            FROM
 9717                RankedTranscripts
 9718            WHERE
 9719                rn = 1
 9720                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9721                AND variants."POS" = RankedTranscripts."POS"
 9722                AND variants."REF" = RankedTranscripts."REF"
 9723                AND variants."ALT" = RankedTranscripts."ALT"
 9724                
 9725        """
 9726        self.execute_query(query=query_update)
 9727
 9728        # Add PZ Transcript in header
 9729        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9730            pz_fields_transcripts,
 9731            ".",
 9732            "String",
 9733            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9734            "unknown",
 9735            "unknown",
 9736            code_type_map["String"],
 9737        )
 9738
 9739        # Return
 9740        return True
 9741
 9742    def create_transcript_view_from_columns_map(
 9743        self,
 9744        transcripts_table: str = "transcripts",
 9745        columns_maps: dict = {},
 9746        added_columns: list = [],
 9747        temporary_tables: list = None,
 9748        annotation_fields: list = None,
 9749    ) -> tuple[list, list, list]:
 9750        """
 9751        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9752        specified columns mapping for transcripts data.
 9753
 9754        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9755        the table where the transcripts data is stored or will be stored in the database. This table
 9756        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9757        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9758        :type transcripts_table: str (optional)
 9759        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9760        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9761        represents a mapping configuration for a specific set of columns. It typically includes details such
 9762        as the main transcript column and additional information columns
 9763        :type columns_maps: dict
 9764        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9765        function is a list that stores the additional columns that will be added to the view being created
 9766        based on the columns map provided. These columns are generated by exploding the transcript
 9767        information columns along with the main transcript column
 9768        :type added_columns: list
 9769        :param temporary_tables: The `temporary_tables` parameter in the
 9770        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9771        tables created during the process of creating a transcript view from a columns map. These temporary
 9772        tables are used to store intermediate results or transformations before the final view is generated
 9773        :type temporary_tables: list
 9774        :param annotation_fields: The `annotation_fields` parameter in the
 9775        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9776        for annotation in the query view creation process. These fields are extracted from the
 9777        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9778        :type annotation_fields: list
 9779        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9780        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9781        """
 9782
 9783        log.debug("Start transcrpts view creation from columns map...")
 9784
 9785        # "from_columns_map": [
 9786        #     {
 9787        #         "transcripts_column": "Ensembl_transcriptid",
 9788        #         "transcripts_infos_columns": [
 9789        #             "genename",
 9790        #             "Ensembl_geneid",
 9791        #             "LIST_S2_score",
 9792        #             "LIST_S2_pred",
 9793        #         ],
 9794        #     },
 9795        #     {
 9796        #         "transcripts_column": "Ensembl_transcriptid",
 9797        #         "transcripts_infos_columns": [
 9798        #             "genename",
 9799        #             "VARITY_R_score",
 9800        #             "Aloft_pred",
 9801        #         ],
 9802        #     },
 9803        # ],
 9804
 9805        # Init
 9806        if temporary_tables is None:
 9807            temporary_tables = []
 9808        if annotation_fields is None:
 9809            annotation_fields = []
 9810
 9811        # Variants table
 9812        table_variants = self.get_table_variants()
 9813
 9814        for columns_map in columns_maps:
 9815
 9816            # Transcript column
 9817            transcripts_column = columns_map.get("transcripts_column", None)
 9818
 9819            # Transcripts infos columns
 9820            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9821
 9822            if transcripts_column is not None:
 9823
 9824                # Explode
 9825                added_columns += self.explode_infos(
 9826                    fields=[transcripts_column] + transcripts_infos_columns
 9827                )
 9828
 9829                # View clauses
 9830                clause_select = []
 9831                for field in [transcripts_column] + transcripts_infos_columns:
 9832                    clause_select.append(
 9833                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9834                    )
 9835                    if field not in [transcripts_column]:
 9836                        annotation_fields.append(field)
 9837
 9838                # Querey View
 9839                query = f""" 
 9840                    SELECT
 9841                        "#CHROM", POS, REF, ALT, INFO,
 9842                        "{transcripts_column}" AS 'transcript',
 9843                        {", ".join(clause_select)}
 9844                    FROM (
 9845                        SELECT 
 9846                            "#CHROM", POS, REF, ALT, INFO,
 9847                            {", ".join(clause_select)}
 9848                        FROM {table_variants}
 9849                        )
 9850                    WHERE "{transcripts_column}" IS NOT NULL
 9851                """
 9852
 9853                # Create temporary table
 9854                temporary_table = transcripts_table + "".join(
 9855                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9856                )
 9857
 9858                # Temporary_tables
 9859                temporary_tables.append(temporary_table)
 9860                query_view = f"""
 9861                    CREATE TEMPORARY TABLE {temporary_table}
 9862                    AS ({query})
 9863                """
 9864                self.execute_query(query=query_view)
 9865
 9866        return added_columns, temporary_tables, annotation_fields
 9867
 9868    def create_transcript_view_from_column_format(
 9869        self,
 9870        transcripts_table: str = "transcripts",
 9871        column_formats: dict = {},
 9872        temporary_tables: list = None,
 9873        annotation_fields: list = None,
 9874    ) -> tuple[list, list, list]:
 9875        """
 9876        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9877        specified column formats, adds additional columns and annotation fields, and returns the list of
 9878        temporary tables and annotation fields.
 9879
 9880        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9881        the table containing the transcripts data. This table will be used as the base table for creating
 9882        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9883        different table name if needed, defaults to transcripts
 9884        :type transcripts_table: str (optional)
 9885        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9886        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9887        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9888        the provided code snippet:
 9889        :type column_formats: dict
 9890        :param temporary_tables: The `temporary_tables` parameter in the
 9891        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9892        views created during the process of creating a transcript view from a column format. These temporary
 9893        views are used to manipulate and extract data before generating the final transcript view. It
 9894        :type temporary_tables: list
 9895        :param annotation_fields: The `annotation_fields` parameter in the
 9896        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9897        that are extracted from the temporary views created during the process. These annotation fields are
 9898        obtained by querying the temporary views and extracting the column names excluding specific columns
 9899        like `#CH
 9900        :type annotation_fields: list
 9901        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9902        `temporary_tables` and `annotation_fields`.
 9903        """
 9904
 9905        log.debug("Start transcrpts view creation from column format...")
 9906
 9907        #  "from_column_format": [
 9908        #     {
 9909        #         "transcripts_column": "ANN",
 9910        #         "transcripts_infos_column": "Feature_ID",
 9911        #     }
 9912        # ],
 9913
 9914        # Init
 9915        if temporary_tables is None:
 9916            temporary_tables = []
 9917        if annotation_fields is None:
 9918            annotation_fields = []
 9919
 9920        for column_format in column_formats:
 9921
 9922            # annotation field and transcript annotation field
 9923            annotation_field = column_format.get("transcripts_column", "ANN")
 9924            transcript_annotation = column_format.get(
 9925                "transcripts_infos_column", "Feature_ID"
 9926            )
 9927
 9928            # Temporary View name
 9929            temporary_view_name = transcripts_table + "".join(
 9930                random.choices(string.ascii_uppercase + string.digits, k=10)
 9931            )
 9932
 9933            # Create temporary view name
 9934            temporary_view_name = self.annotation_format_to_table(
 9935                uniquify=True,
 9936                annotation_field=annotation_field,
 9937                view_name=temporary_view_name,
 9938                annotation_id=transcript_annotation,
 9939            )
 9940
 9941            # Annotation fields
 9942            if temporary_view_name:
 9943                query_annotation_fields = f"""
 9944                    SELECT *
 9945                    FROM (
 9946                        DESCRIBE SELECT *
 9947                        FROM {temporary_view_name}
 9948                        )
 9949                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9950                """
 9951                df_annotation_fields = self.get_query_to_df(
 9952                    query=query_annotation_fields
 9953                )
 9954
 9955                # Add temporary view and annotation fields
 9956                temporary_tables.append(temporary_view_name)
 9957                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9958
 9959        return temporary_tables, annotation_fields
 9960
 9961    def create_transcript_view(
 9962        self,
 9963        transcripts_table: str = None,
 9964        transcripts_table_drop: bool = True,
 9965        param: dict = {},
 9966    ) -> str:
 9967        """
 9968        The `create_transcript_view` function generates a transcript view by processing data from a
 9969        specified table based on provided parameters and structural information.
 9970
 9971        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9972        is used to specify the name of the table that will store the final transcript view data. If a table
 9973        name is not provided, the function will create a new table to store the transcript view data, and by
 9974        default,, defaults to transcripts
 9975        :type transcripts_table: str (optional)
 9976        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9977        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9978        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9979        the function will drop the existing transcripts table if it exists, defaults to True
 9980        :type transcripts_table_drop: bool (optional)
 9981        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9982        contains information needed to create a transcript view. It includes details such as the structure
 9983        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9984        the view. This parameter allows for flexibility and customization
 9985        :type param: dict
 9986        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9987        created or modified during the execution of the function.
 9988        """
 9989
 9990        log.debug("Start transcripts view creation...")
 9991
 9992        # Default
 9993        transcripts_table_default = "transcripts"
 9994
 9995        # Param
 9996        if not param:
 9997            param = self.get_param()
 9998
 9999        # Struct
10000        struct = param.get("transcripts", {}).get("struct", None)
10001
10002        if struct:
10003
10004            # Transcripts table
10005            if transcripts_table is None:
10006                transcripts_table = param.get("transcripts", {}).get(
10007                    "table", transcripts_table_default
10008                )
10009
10010            # added_columns
10011            added_columns = []
10012
10013            # Temporary tables
10014            temporary_tables = []
10015
10016            # Annotation fields
10017            annotation_fields = []
10018
10019            # from columns map
10020            columns_maps = struct.get("from_columns_map", [])
10021            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10022                self.create_transcript_view_from_columns_map(
10023                    transcripts_table=transcripts_table,
10024                    columns_maps=columns_maps,
10025                    added_columns=added_columns,
10026                    temporary_tables=temporary_tables,
10027                    annotation_fields=annotation_fields,
10028                )
10029            )
10030            added_columns += added_columns_tmp
10031            temporary_tables += temporary_tables_tmp
10032            annotation_fields += annotation_fields_tmp
10033
10034            # from column format
10035            column_formats = struct.get("from_column_format", [])
10036            temporary_tables_tmp, annotation_fields_tmp = (
10037                self.create_transcript_view_from_column_format(
10038                    transcripts_table=transcripts_table,
10039                    column_formats=column_formats,
10040                    temporary_tables=temporary_tables,
10041                    annotation_fields=annotation_fields,
10042                )
10043            )
10044            temporary_tables += temporary_tables_tmp
10045            annotation_fields += annotation_fields_tmp
10046
10047            # Merge temporary tables query
10048            query_merge = ""
10049            for temporary_table in temporary_tables:
10050
10051                # First temporary table
10052                if not query_merge:
10053                    query_merge = f"""
10054                        SELECT * FROM {temporary_table}
10055                    """
10056                # other temporary table (using UNION)
10057                else:
10058                    query_merge += f"""
10059                        UNION BY NAME SELECT * FROM {temporary_table}
10060                    """
10061
10062            # Merge on transcript
10063            query_merge_on_transcripts_annotation_fields = []
10064            # Aggregate all annotations fields
10065            for annotation_field in set(annotation_fields):
10066                query_merge_on_transcripts_annotation_fields.append(
10067                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
10068                )
10069            # Query for transcripts view
10070            query_merge_on_transcripts = f"""
10071                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
10072                FROM ({query_merge})
10073                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
10074            """
10075
10076            # Drop transcript view is necessary
10077            if transcripts_table_drop:
10078                query_drop = f"""
10079                    DROP TABLE IF EXISTS {transcripts_table};
10080                """
10081                self.execute_query(query=query_drop)
10082
10083            # Merge and create transcript view
10084            query_create_view = f"""
10085                CREATE TABLE IF NOT EXISTS {transcripts_table}
10086                AS {query_merge_on_transcripts}
10087            """
10088            self.execute_query(query=query_create_view)
10089
10090            # Remove added columns
10091            for added_column in added_columns:
10092                self.drop_column(column=added_column)
10093
10094        else:
10095
10096            transcripts_table = None
10097
10098        return transcripts_table
10099
10100    def annotation_format_to_table(
10101        self,
10102        uniquify: bool = True,
10103        annotation_field: str = "ANN",
10104        annotation_id: str = "Feature_ID",
10105        view_name: str = "transcripts",
10106    ) -> str:
10107        """
10108        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
10109        table format.
10110
10111        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
10112        values in the output or not. If set to `True`, the function will make sure that the output values
10113        are unique, defaults to True
10114        :type uniquify: bool (optional)
10115        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
10116        contains the annotation information for each variant. This field is used to extract the annotation
10117        details for further processing in the function, defaults to ANN
10118        :type annotation_field: str (optional)
10119        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is
10120        used to specify the identifier for the annotation feature. This identifier will be used as a column
10121        name in the resulting table or view that is created based on the annotation data. It helps in
10122        uniquely identifying each annotation entry in the, defaults to Feature_ID
10123        :type annotation_id: str (optional)
10124        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to
10125        specify the name of the temporary table that will be created to store the transformed annotation
10126        data. This table will hold the extracted information from the annotation field in a structured
10127        format for further processing or analysis, defaults to transcripts
10128        :type view_name: str (optional)
10129        :return: The function `annotation_format_to_table` is returning the name of the view created, which
10130        is stored in the variable `view_name`.
10131        """
10132
10133        # Annotation field
10134        annotation_format = "annotation_explode"
10135
10136        # Transcript annotation
10137        annotation_id = "".join(char for char in annotation_id if char.isalnum())
10138
10139        # Prefix
10140        prefix = self.get_explode_infos_prefix()
10141        if prefix:
10142            prefix = "INFO/"
10143
10144        # Annotation fields
10145        annotation_infos = prefix + annotation_field
10146        annotation_format_infos = prefix + annotation_format
10147
10148        # Variants table
10149        table_variants = self.get_table_variants()
10150
10151        # Header
10152        vcf_reader = self.get_header()
10153
10154        # Add columns
10155        added_columns = []
10156
10157        # Explode HGVS field in column
10158        added_columns += self.explode_infos(fields=[annotation_field])
10159
10160        if annotation_field in vcf_reader.infos:
10161
10162            # Extract ANN header
10163            ann_description = vcf_reader.infos[annotation_field].desc
10164            pattern = r"'(.+?)'"
10165            match = re.search(pattern, ann_description)
10166            if match:
10167                ann_header_match = match.group(1).split(" | ")
10168                ann_header = []
10169                ann_header_desc = {}
10170                for i in range(len(ann_header_match)):
10171                    ann_header_info = "".join(
10172                        char for char in ann_header_match[i] if char.isalnum()
10173                    )
10174                    ann_header.append(ann_header_info)
10175                    ann_header_desc[ann_header_info] = ann_header_match[i]
10176                if not ann_header_desc:
10177                    raise ValueError("Invalid header description format")
10178            else:
10179                raise ValueError("Invalid header description format")
10180
10181            # Create variant id
10182            variant_id_column = self.get_variant_id_column()
10183            added_columns += [variant_id_column]
10184
10185            # Create dataframe
10186            dataframe_annotation_format = self.get_query_to_df(
10187                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
10188            )
10189
10190            # Create annotation columns
10191            dataframe_annotation_format[
10192                annotation_format_infos
10193            ] = dataframe_annotation_format[annotation_infos].apply(
10194                lambda x: explode_annotation_format(
10195                    annotation=str(x),
10196                    uniquify=uniquify,
10197                    output_format="JSON",
10198                    prefix="",
10199                    header=list(ann_header_desc.values()),
10200                )
10201            )
10202
10203            # Find keys
10204            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
10205            df_keys = self.get_query_to_df(query=query_json)
10206
10207            # Check keys
10208            query_json_key = []
10209            for _, row in df_keys.iterrows():
10210
10211                # Key
10212                key = row.iloc[0]
10213
10214                # key_clean
10215                key_clean = "".join(char for char in key if char.isalnum())
10216
10217                # Type
10218                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
10219
10220                # Get DataFrame from query
10221                df_json_type = self.get_query_to_df(query=query_json_type)
10222
10223                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
10224                with pd.option_context("future.no_silent_downcasting", True):
10225                    df_json_type.fillna(value="", inplace=True)
10226                    replace_dict = {None: np.nan, "": np.nan}
10227                    df_json_type.replace(replace_dict, inplace=True)
10228                    df_json_type.dropna(inplace=True)
10229
10230                # Detect column type
10231                column_type = detect_column_type(df_json_type[key_clean])
10232
10233                # Append
10234                query_json_key.append(
10235                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
10236                )
10237
10238            # Create view
10239            query_view = f"""
10240                CREATE TEMPORARY TABLE {view_name}
10241                AS (
10242                    SELECT *, {annotation_id} AS 'transcript'
10243                    FROM (
10244                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
10245                        FROM dataframe_annotation_format
10246                        )
10247                    );
10248            """
10249            self.execute_query(query=query_view)
10250
10251        else:
10252
10253            # Return None
10254            view_name = None
10255
10256        # Remove added columns
10257        for added_column in added_columns:
10258            self.drop_column(column=added_column)
10259
10260        return view_name
10261
10262    def transcript_view_to_variants(
10263        self,
10264        transcripts_table: str = None,
10265        transcripts_column_id: str = None,
10266        transcripts_info_json: str = None,
10267        transcripts_info_field_json: str = None,
10268        transcripts_info_format: str = None,
10269        transcripts_info_field_format: str = None,
10270        param: dict = {},
10271    ) -> bool:
10272        """
10273        The `transcript_view_to_variants` function updates a variants table with information from
10274        transcripts in JSON format.
10275
10276        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10277        table containing the transcripts data. If this parameter is not provided, the function will
10278        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10279        :type transcripts_table: str
10280        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10281        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10282        identifier is used to match transcripts with variants in the database
10283        :type transcripts_column_id: str
10284        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10285        of the column in the variants table where the transcripts information will be stored in JSON
10286        format. This parameter allows you to define the column in the variants table that will hold the
10287        JSON-formatted information about transcripts
10288        :type transcripts_info_json: str
10289        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10290        specify the field in the VCF header that will contain information about transcripts in JSON
10291        format. This field will be added to the VCF header as an INFO field with the specified name
10292        :type transcripts_info_field_json: str
10293        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10294        format of the information about transcripts that will be stored in the variants table. This
10295        format can be used to define how the transcript information will be structured or displayed
10296        within the variants table
10297        :type transcripts_info_format: str
10298        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10299        specify the field in the VCF header that will contain information about transcripts in a
10300        specific format. This field will be added to the VCF header as an INFO field with the specified
10301        name
10302        :type transcripts_info_field_format: str
10303        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10304        that contains various configuration settings related to transcripts. It is used to provide
10305        default values for certain parameters if they are not explicitly provided when calling the
10306        method. The `param` dictionary can be passed as an argument
10307        :type param: dict
10308        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10309        if the operation is successful and `False` if certain conditions are not met.
10310        """
10311
10312        msg_info_prefix = "Start transcripts view to variants annotations"
10313
10314        log.debug(f"{msg_info_prefix}...")
10315
10316        # Default
10317        transcripts_table_default = "transcripts"
10318        transcripts_column_id_default = "transcript"
10319        transcripts_info_json_default = None
10320        transcripts_info_format_default = None
10321        transcripts_info_field_json_default = None
10322        transcripts_info_field_format_default = None
10323
10324        # Param
10325        if not param:
10326            param = self.get_param()
10327
10328        # Transcripts table
10329        if transcripts_table is None:
10330            transcripts_table = param.get("transcripts", {}).get(
10331                "table", transcripts_table_default
10332            )
10333
10334        # Transcripts column ID
10335        if transcripts_column_id is None:
10336            transcripts_column_id = param.get("transcripts", {}).get(
10337                "column_id", transcripts_column_id_default
10338            )
10339
10340        # Transcripts info json
10341        if transcripts_info_json is None:
10342            transcripts_info_json = param.get("transcripts", {}).get(
10343                "transcripts_info_json", transcripts_info_json_default
10344            )
10345
10346        # Transcripts info field JSON
10347        if transcripts_info_field_json is None:
10348            transcripts_info_field_json = param.get("transcripts", {}).get(
10349                "transcripts_info_field_json", transcripts_info_field_json_default
10350            )
10351        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10352        #     transcripts_info_json = transcripts_info_field_json
10353
10354        # Transcripts info format
10355        if transcripts_info_format is None:
10356            transcripts_info_format = param.get("transcripts", {}).get(
10357                "transcripts_info_format", transcripts_info_format_default
10358            )
10359
10360        # Transcripts info field FORMAT
10361        if transcripts_info_field_format is None:
10362            transcripts_info_field_format = param.get("transcripts", {}).get(
10363                "transcripts_info_field_format", transcripts_info_field_format_default
10364            )
10365        # if (
10366        #     transcripts_info_field_format is not None
10367        #     and transcripts_info_format is None
10368        # ):
10369        #     transcripts_info_format = transcripts_info_field_format
10370
10371        # Variants table
10372        table_variants = self.get_table_variants()
10373
10374        # Check info columns param
10375        if (
10376            transcripts_info_json is None
10377            and transcripts_info_field_json is None
10378            and transcripts_info_format is None
10379            and transcripts_info_field_format is None
10380        ):
10381            return False
10382
10383        # Transcripts infos columns
10384        query_transcripts_infos_columns = f"""
10385            SELECT *
10386            FROM (
10387                DESCRIBE SELECT * FROM {transcripts_table}
10388                )
10389            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10390        """
10391        transcripts_infos_columns = list(
10392            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10393        )
10394
10395        # View results
10396        clause_select = []
10397        clause_to_json = []
10398        clause_to_format = []
10399        for field in transcripts_infos_columns:
10400            clause_select.append(
10401                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10402            )
10403            clause_to_json.append(f""" '{field}': "{field}" """)
10404            clause_to_format.append(f""" "{field}" """)
10405
10406        # Update
10407        update_set_json = []
10408        update_set_format = []
10409
10410        # VCF header
10411        vcf_reader = self.get_header()
10412
10413        # Transcripts to info column in JSON
10414        if transcripts_info_json is not None:
10415
10416            # Create column on variants table
10417            self.add_column(
10418                table_name=table_variants,
10419                column_name=transcripts_info_json,
10420                column_type="JSON",
10421                default_value=None,
10422                drop=False,
10423            )
10424
10425            # Add header
10426            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10427                transcripts_info_json,
10428                ".",
10429                "String",
10430                "Transcripts in JSON format",
10431                "unknwon",
10432                "unknwon",
10433                self.code_type_map["String"],
10434            )
10435
10436            # Add to update
10437            update_set_json.append(
10438                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10439            )
10440
10441        # Transcripts to info field in JSON
10442        if transcripts_info_field_json is not None:
10443
10444            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10445
10446            # Add to update
10447            update_set_json.append(
10448                f""" 
10449                    INFO = concat(
10450                            CASE
10451                                WHEN INFO NOT IN ('', '.')
10452                                THEN INFO
10453                                ELSE ''
10454                            END,
10455                            CASE
10456                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10457                                THEN concat(
10458                                    ';{transcripts_info_field_json}=',
10459                                    t.{transcripts_info_json}
10460                                )
10461                                ELSE ''
10462                            END
10463                            )
10464                """
10465            )
10466
10467            # Add header
10468            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10469                transcripts_info_field_json,
10470                ".",
10471                "String",
10472                "Transcripts in JSON format",
10473                "unknwon",
10474                "unknwon",
10475                self.code_type_map["String"],
10476            )
10477
10478        if update_set_json:
10479
10480            # Update query
10481            query_update = f"""
10482                UPDATE {table_variants}
10483                    SET {", ".join(update_set_json)}
10484                FROM
10485                (
10486                    SELECT
10487                        "#CHROM", POS, REF, ALT,
10488                            concat(
10489                            '{{',
10490                            string_agg(
10491                                '"' || "{transcripts_column_id}" || '":' ||
10492                                to_json(json_output)
10493                            ),
10494                            '}}'
10495                            )::JSON AS {transcripts_info_json}
10496                    FROM
10497                        (
10498                        SELECT
10499                            "#CHROM", POS, REF, ALT,
10500                            "{transcripts_column_id}",
10501                            to_json(
10502                                {{{",".join(clause_to_json)}}}
10503                            )::JSON AS json_output
10504                        FROM
10505                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10506                        WHERE "{transcripts_column_id}" IS NOT NULL
10507                        )
10508                    GROUP BY "#CHROM", POS, REF, ALT
10509                ) AS t
10510                WHERE {table_variants}."#CHROM" = t."#CHROM"
10511                    AND {table_variants}."POS" = t."POS"
10512                    AND {table_variants}."REF" = t."REF"
10513                    AND {table_variants}."ALT" = t."ALT"
10514            """
10515
10516            self.execute_query(query=query_update)
10517
10518        # Transcripts to info column in FORMAT
10519        if transcripts_info_format is not None:
10520
10521            # Create column on variants table
10522            self.add_column(
10523                table_name=table_variants,
10524                column_name=transcripts_info_format,
10525                column_type="VARCHAR",
10526                default_value=None,
10527                drop=False,
10528            )
10529
10530            # Add header
10531            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10532                transcripts_info_format,
10533                ".",
10534                "String",
10535                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10536                "unknwon",
10537                "unknwon",
10538                self.code_type_map["String"],
10539            )
10540
10541            # Add to update
10542            update_set_format.append(
10543                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10544            )
10545
10546        # Transcripts to info field in JSON
10547        if transcripts_info_field_format is not None:
10548
10549            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10550
10551            # Add to update
10552            update_set_format.append(
10553                f""" 
10554                    INFO = concat(
10555                            CASE
10556                                WHEN INFO NOT IN ('', '.')
10557                                THEN INFO
10558                                ELSE ''
10559                            END,
10560                            CASE
10561                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10562                                THEN concat(
10563                                    ';{transcripts_info_field_format}=',
10564                                    t.{transcripts_info_format}
10565                                )
10566                                ELSE ''
10567                            END
10568                            )
10569                """
10570            )
10571
10572            # Add header
10573            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10574                transcripts_info_field_format,
10575                ".",
10576                "String",
10577                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10578                "unknwon",
10579                "unknwon",
10580                self.code_type_map["String"],
10581            )
10582
10583        if update_set_format:
10584
10585            # Update query
10586            query_update = f"""
10587                UPDATE {table_variants}
10588                    SET {", ".join(update_set_format)}
10589                FROM
10590                (
10591                    SELECT
10592                        "#CHROM", POS, REF, ALT,
10593                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10594                    FROM 
10595                        (
10596                        SELECT
10597                            "#CHROM", POS, REF, ALT,
10598                            "{transcripts_column_id}",
10599                            concat(
10600                                "{transcripts_column_id}",
10601                                '|',
10602                                {", '|', ".join(clause_to_format)}
10603                            ) AS {transcripts_info_format}
10604                        FROM
10605                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10606                        )
10607                    GROUP BY "#CHROM", POS, REF, ALT
10608                ) AS t
10609                WHERE {table_variants}."#CHROM" = t."#CHROM"
10610                    AND {table_variants}."POS" = t."POS"
10611                    AND {table_variants}."REF" = t."REF"
10612                    AND {table_variants}."ALT" = t."ALT"
10613            """
10614
10615            self.execute_query(query=query_update)
10616
10617        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
36    def __init__(
37        self,
38        conn=None,
39        input: str = None,
40        output: str = None,
41        config: dict = {},
42        param: dict = {},
43        load: bool = False,
44    ) -> None:
45        """
46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
47        header
48
49        :param conn: the connection to the database
50        :param input: the input file
51        :param output: the output file
52        :param config: a dictionary containing the configuration of the model
53        :param param: a dictionary containing the parameters of the model
54        """
55
56        # Init variables
57        self.init_variables()
58
59        # Input
60        self.set_input(input)
61
62        # Config
63        self.set_config(config)
64
65        # Param
66        self.set_param(param)
67
68        # Output
69        self.set_output(output)
70
71        # connexion
72        self.set_connexion(conn)
73
74        # Header
75        self.set_header()
76
77        # Samples
78        self.set_samples()
79
80        # Load data
81        if load:
82            self.load_data()

The function __init__ initializes the variables and sets the input, output, config, param, connection and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 84    def set_samples(self, samples: list = None) -> list:
 85        """
 86        The function `set_samples` sets the samples attribute of an object to a provided list or
 87        retrieves it from a parameter dictionary.
 88
 89        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 90        input and sets the `samples` attribute of the class to the provided list. If no samples are
 91        provided, it tries to get the samples from the class's parameters using the `get_param` method
 92        :type samples: list
 93        :return: The `samples` list is being returned.
 94        """
 95
 96        if not samples:
 97            samples = self.get_param().get("samples", {}).get("list", None)
 98
 99        self.samples = samples
100
101        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
103    def get_samples(self) -> list:
104        """
105        This function returns a list of samples.
106        :return: The `get_samples` method is returning the `samples` attribute of the object.
107        """
108
109        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
    def get_samples_check(self) -> bool:
        """
        Return whether sample checking is enabled.

        Reads the "check" key nested in the "samples" dictionary of the
        parameters returned by `get_param()`.

        :return: The value of `param["samples"]["check"]`. Defaults to
            `True` when the key is absent (the previous docstring claimed
            `False`, which did not match the code).
        """

        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it will return True.

def set_input(self, input: str = None) -> None:
122    def set_input(self, input: str = None) -> None:
123        """
124        The function `set_input` takes a file name as input, extracts the name and extension, and sets
125        attributes in the class accordingly.
126
127        :param input: The `set_input` method in the provided code snippet is used to set attributes
128        related to the input file. Here's a breakdown of the parameters and their usage in the method:
129        :type input: str
130        """
131
132        if input and not isinstance(input, str):
133            try:
134                self.input = input.name
135            except:
136                log.error(f"Input file '{input} in bad format")
137                raise ValueError(f"Input file '{input} in bad format")
138        else:
139            self.input = input
140
141        # Input format
142        if input:
143            input_name, input_extension = os.path.splitext(self.input)
144            self.input_name = input_name
145            self.input_extension = input_extension
146            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: Path to the input file, or a file-like object exposing a name attribute; used to set the input-related attributes of the class (path, name, extension, and format).
def set_config(self, config: dict) -> None:
148    def set_config(self, config: dict) -> None:
149        """
150        The set_config function takes a config object and assigns it as the configuration object for the
151        class.
152
153        :param config: The `config` parameter in the `set_config` function is a dictionary object that
154        contains configuration settings for the class. When you call the `set_config` function with a
155        dictionary object as the argument, it will set that dictionary as the configuration object for
156        the class
157        :type config: dict
158        """
159
160        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
162    def set_param(self, param: dict) -> None:
163        """
164        This function sets a parameter object for the class based on the input dictionary.
165
166        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
167        as the `param` attribute of the class instance
168        :type param: dict
169        """
170
171        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
173    def init_variables(self) -> None:
174        """
175        This function initializes the variables that will be used in the rest of the class
176        """
177
178        self.prefix = "howard"
179        self.table_variants = "variants"
180        self.dataframe = None
181
182        self.comparison_map = {
183            "gt": ">",
184            "gte": ">=",
185            "lt": "<",
186            "lte": "<=",
187            "equals": "=",
188            "contains": "SIMILAR TO",
189        }
190
191        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
192
193        self.code_type_map_to_sql = {
194            "Integer": "INTEGER",
195            "String": "VARCHAR",
196            "Float": "FLOAT",
197            "Flag": "VARCHAR",
198        }
199
200        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
202    def get_indexing(self) -> bool:
203        """
204        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
205        returns False.
206        :return: The value of the indexing parameter.
207        """
208
209        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
211    def get_connexion_config(self) -> dict:
212        """
213        The function `get_connexion_config` returns a dictionary containing the configuration for a
214        connection, including the number of threads and memory limit.
215        :return: a dictionary containing the configuration for the Connexion library.
216        """
217
218        # config
219        config = self.get_config()
220
221        # Connexion config
222        connexion_config = {}
223        threads = self.get_threads()
224
225        # Threads
226        if threads:
227            connexion_config["threads"] = threads
228
229        # Memory
230        # if config.get("memory", None):
231        #     connexion_config["memory_limit"] = config.get("memory")
232        if self.get_memory():
233            connexion_config["memory_limit"] = self.get_memory()
234
235        # Temporary directory
236        if config.get("tmp", None):
237            connexion_config["temp_directory"] = config.get("tmp")
238
239        # Access
240        if config.get("access", None):
241            access = config.get("access")
242            if access in ["RO"]:
243                access = "READ_ONLY"
244            elif access in ["RW"]:
245                access = "READ_WRITE"
246            connexion_db = self.get_connexion_db()
247            if connexion_db in ":memory:":
248                access = "READ_WRITE"
249            connexion_config["access_mode"] = access
250
251        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the Connexion library.

def get_duckdb_settings(self) -> dict:
253    def get_duckdb_settings(self) -> dict:
254        """
255        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
256        string.
257        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
258        """
259
260        # config
261        config = self.get_config()
262
263        # duckdb settings
264        duckdb_settings_dict = {}
265        if config.get("duckdb_settings", None):
266            duckdb_settings = config.get("duckdb_settings")
267            duckdb_settings = full_path(duckdb_settings)
268            # duckdb setting is a file
269            if os.path.exists(duckdb_settings):
270                with open(duckdb_settings) as json_file:
271                    duckdb_settings_dict = yaml.safe_load(json_file)
272            # duckdb settings is a string
273            else:
274                duckdb_settings_dict = json.loads(duckdb_settings)
275
276        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
278    def set_connexion_db(self) -> str:
279        """
280        The function `set_connexion_db` returns the appropriate database connection string based on the
281        input format and connection type.
282        :return: the value of the variable `connexion_db`.
283        """
284
285        # Default connexion db
286        default_connexion_db = ":memory:"
287
288        # Find connexion db
289        if self.get_input_format() in ["db", "duckdb"]:
290            connexion_db = self.get_input()
291        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
292            connexion_db = default_connexion_db
293        elif self.get_connexion_type() in ["tmpfile"]:
294            tmp_name = tempfile.mkdtemp(
295                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
296            )
297            connexion_db = f"{tmp_name}/tmp.db"
298        elif self.get_connexion_type() != "":
299            connexion_db = self.get_connexion_type()
300        else:
301            connexion_db = default_connexion_db
302
303        # Set connexion db
304        self.connexion_db = connexion_db
305
306        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
308    def set_connexion(self, conn) -> None:
309        """
310        The function `set_connexion` creates a connection to a database, with options for different
311        database formats and settings.
312
313        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
314        database. If a connection is not provided, a new connection to an in-memory database is created.
315        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
316        sqlite
317        """
318
319        # Connexion db
320        connexion_db = self.set_connexion_db()
321
322        # Connexion config
323        connexion_config = self.get_connexion_config()
324
325        # Connexion format
326        connexion_format = self.get_config().get("connexion_format", "duckdb")
327        # Set connexion format
328        self.connexion_format = connexion_format
329
330        # Connexion
331        if not conn:
332            if connexion_format in ["duckdb"]:
333                conn = duckdb.connect(connexion_db, config=connexion_config)
334                # duckDB settings
335                duckdb_settings = self.get_duckdb_settings()
336                if duckdb_settings:
337                    for setting in duckdb_settings:
338                        setting_value = duckdb_settings.get(setting)
339                        if isinstance(setting_value, str):
340                            setting_value = f"'{setting_value}'"
341                        conn.execute(f"PRAGMA {setting}={setting_value};")
342            elif connexion_format in ["sqlite"]:
343                conn = sqlite3.connect(connexion_db)
344
345        # Set connexion
346        self.conn = conn
347
348        # Log
349        log.debug(f"connexion_format: {connexion_format}")
350        log.debug(f"connexion_db: {connexion_db}")
351        log.debug(f"connexion config: {connexion_config}")
352        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
354    def set_output(self, output: str = None) -> None:
355        """
356        The `set_output` function in Python sets the output file based on the input or a specified key
357        in the config file, extracting the output name, extension, and format.
358
359        :param output: The `output` parameter in the `set_output` method is used to specify the name of
360        the output file. If the config file has an 'output' key, the method sets the output to the value
361        of that key. If no output is provided, it sets the output to `None`
362        :type output: str
363        """
364
365        if output and not isinstance(output, str):
366            self.output = output.name
367        else:
368            self.output = output
369
370        # Output format
371        if self.output:
372            output_name, output_extension = os.path.splitext(self.output)
373            self.output_name = output_name
374            self.output_extension = output_extension
375            self.output_format = self.output_extension.replace(".", "")
376        else:
377            self.output_name = None
378            self.output_extension = None
379            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
381    def set_header(self) -> None:
382        """
383        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
384        """
385
386        input_file = self.get_input()
387        default_header_list = [
388            "##fileformat=VCFv4.2",
389            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
390        ]
391
392        # Full path
393        input_file = full_path(input_file)
394
395        if input_file:
396
397            input_format = self.get_input_format()
398            input_compressed = self.get_input_compressed()
399            config = self.get_config()
400            header_list = default_header_list
401            if input_format in [
402                "vcf",
403                "hdr",
404                "tsv",
405                "csv",
406                "psv",
407                "parquet",
408                "db",
409                "duckdb",
410            ]:
411                # header provided in param
412                if config.get("header_file", None):
413                    with open(config.get("header_file"), "rt") as f:
414                        header_list = self.read_vcf_header(f)
415                # within a vcf file format (header within input file itsself)
416                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
417                    # within a compressed vcf file format (.vcf.gz)
418                    if input_compressed:
419                        with bgzf.open(input_file, "rt") as f:
420                            header_list = self.read_vcf_header(f)
421                    # within an uncompressed vcf file format (.vcf)
422                    else:
423                        with open(input_file, "rt") as f:
424                            header_list = self.read_vcf_header(f)
425                # header provided in default external file .hdr
426                elif os.path.exists((input_file + ".hdr")):
427                    with open(input_file + ".hdr", "rt") as f:
428                        header_list = self.read_vcf_header(f)
429                else:
430                    try:  # Try to get header info fields and file columns
431
432                        with tempfile.TemporaryDirectory() as tmpdir:
433
434                            # Create database
435                            db_for_header = Database(database=input_file)
436
437                            # Get header columns for infos fields
438                            db_header_from_columns = (
439                                db_for_header.get_header_from_columns()
440                            )
441
442                            # Get real columns in the file
443                            db_header_columns = db_for_header.get_columns()
444
445                            # Write header file
446                            header_file_tmp = os.path.join(tmpdir, "header")
447                            f = open(header_file_tmp, "w")
448                            vcf.Writer(f, db_header_from_columns)
449                            f.close()
450
451                            # Replace #CHROM line with rel columns
452                            header_list = db_for_header.read_header_file(
453                                header_file=header_file_tmp
454                            )
455                            header_list[-1] = "\t".join(db_header_columns)
456
457                    except:
458
459                        log.warning(
460                            f"No header for file {input_file}. Set as default VCF header"
461                        )
462                        header_list = default_header_list
463
464            else:  # try for unknown format ?
465
466                log.error(f"Input file format '{input_format}' not available")
467                raise ValueError(f"Input file format '{input_format}' not available")
468
469            if not header_list:
470                header_list = default_header_list
471
472            # header as list
473            self.header_list = header_list
474
475            # header as VCF object
476            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
477
478        else:
479
480            self.header_list = None
481            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
483    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
484        """
485        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
486        DataFrame based on the connection format.
487
488        :param query: The `query` parameter in the `get_query_to_df` function is a string that
489        represents the SQL query you want to execute. This query will be used to fetch data from a
490        database and convert it into a pandas DataFrame
491        :type query: str
492        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
493        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
494        function will only fetch up to that number of rows from the database query result. If no limit
495        is specified,
496        :type limit: int
497        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
498        """
499
500        # Connexion format
501        connexion_format = self.get_connexion_format()
502
503        # Limit in query
504        if limit:
505            pd.set_option("display.max_rows", limit)
506            if connexion_format in ["duckdb"]:
507                df = (
508                    self.conn.execute(query)
509                    .fetch_record_batch(limit)
510                    .read_next_batch()
511                    .to_pandas()
512                )
513            elif connexion_format in ["sqlite"]:
514                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
515
516        # Full query
517        else:
518            if connexion_format in ["duckdb"]:
519                df = self.conn.execute(query).df()
520            elif connexion_format in ["sqlite"]:
521                df = pd.read_sql_query(query, self.conn)
522
523        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the full query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
525    def get_overview(self) -> None:
526        """
527        The function prints the input, output, config, and dataframe of the current object
528        """
529        table_variants_from = self.get_table_variants(clause="from")
530        sql_columns = self.get_header_columns_as_sql()
531        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
532        df = self.get_query_to_df(sql_query_export)
533        log.info(
534            "Input:  "
535            + str(self.get_input())
536            + " ["
537            + str(str(self.get_input_format()))
538            + "]"
539        )
540        log.info(
541            "Output: "
542            + str(self.get_output())
543            + " ["
544            + str(str(self.get_output_format()))
545            + "]"
546        )
547        log.info("Config: ")
548        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
549            "\n"
550        ):
551            log.info("\t" + str(d))
552        log.info("Param: ")
553        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
554            "\n"
555        ):
556            log.info("\t" + str(d))
557        log.info("Sample list: " + str(self.get_header_sample_list()))
558        log.info("Dataframe: ")
559        for d in str(df).split("\n"):
560            log.info("\t" + str(d))
561
562        # garbage collector
563        del df
564        gc.collect()
565
566        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
    def get_stats(self) -> dict:
        """
        Compute statistics of the loaded variants.

        Collected statistics: input file info, number of variants (total
        and per chromosome), per-sample genotype counts (when the file has
        FORMAT and GT), INFO and FORMAT header fields, QUAL distribution
        (when a QUAL column exists), and SNV/MNV/InDel counts with SNV
        substitutions.

        NOTE(review): the SQL uses duckdb-specific functions
        (REGEXP_EXTRACT, len, string_split, median) — presumably this
        method is only called with a duckdb connection; confirm.

        :return: A dictionary with keys "Infos", "Variants", "Header",
            and optionally "Samples" and "Quality"
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table (FROM clause)
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field definitions
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check samples: only meaningful when the file has genotype columns
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; a sample column is considered
                # valid when it starts with a genotype (e.g. 0/1) and has as
                # many ':'-separated fields as the FORMAT column
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: map PyVCF special codes to VCF spec letters
                # (None -> '.', -1 -> 'A', -2 -> 'G', -3 -> 'R')
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ('.' when undefined)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description ('' when undefined)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL ('.' values excluded from the distribution)
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel branch, SQL precedence parses the filter
        # as len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT)), which
        # also counts MNVs with len(REF) > 1 — confirm intended grouping.

        sql_query_snv = f"""

            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)

                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitutions (e.g. A>G), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
    """
    Compute the object's statistics and write them to a JSON file.

    :param file: path of the JSON file to write; must be provided
    :type file: str
    :return: the path of the file that was written
    :raises ValueError: if no output file path is provided
    """

    # Fail early with a clear message instead of an opaque TypeError
    # raised later by open(None)
    if not file:
        raise ValueError("No output file provided for stats")

    # Get stats
    stats = self.get_stats()

    # Serialize stats as pretty-printed JSON and write them out
    with open(file, "w") as outfile:
        json.dump(stats, outfile, indent=4)

    return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    """
    Generate a Markdown report of the statistics and print it to stdout.

    The stats are first serialized to a JSON file (via `stats_to_file`),
    then rendered as Markdown sections with an index: the report is
    written to `output_file` and the title and sections are echoed on
    stdout.

    :param output_file: path of the Markdown output file; defaults to
        "stats.md" in a temporary directory
    :type output_file: str
    :param json_file: path of the JSON stats file; defaults to
        "stats.json" in a temporary directory
    :type json_file: str
    :return: None
    """

    # Full path
    output_file = full_path(output_file)
    json_file = full_path(json_file)

    with tempfile.TemporaryDirectory() as tmpdir:

        # Default files live in the temporary directory
        if not output_file:
            output_file = os.path.join(tmpdir, "stats.md")
        if not json_file:
            json_file = os.path.join(tmpdir, "stats.json")

        # Create parent folders if needed
        if not os.path.exists(os.path.dirname(output_file)):
            Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
        if not os.path.exists(os.path.dirname(json_file)):
            Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

        # Create stats JSON file
        stats_file = self.stats_to_file(file=json_file)

        # The stats file is JSON (written by stats_to_file), so parse it
        # with the JSON parser rather than a YAML loader
        with open(stats_file) as f:
            stats = json.load(f)

        # Output buffers: title, index and body sections
        output_title = ["# HOWARD Stats"]
        output_index = ["## Index"]
        output = []

        # Process sections
        for section in stats:
            infos = stats.get(section)
            section_link = "#" + section.lower().replace(" ", "-")
            output.append(f"## {section}")
            output_index.append(f"- [{section}]({section_link})")

            if len(infos):
                for info in infos:
                    # Try to render the value as a table; fall back to a
                    # plain "key: value" line when it is not tabular
                    try:
                        df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                        is_df = True
                    except Exception:
                        try:
                            df = pd.DataFrame.from_dict(
                                json.loads((infos.get(info))), orient="index"
                            )
                            is_df = True
                        except Exception:
                            is_df = False
                    if is_df:
                        output.append(f"### {info}")
                        info_link = "#" + info.lower().replace(" ", "-")
                        output_index.append(f"   - [{info}]({info_link})")
                        output.append(f"{df.to_markdown(index=False)}")
                    else:
                        output.append(f"- {info}: {infos.get(info)}")
            else:
                output.append("NA")

        # Write title, index and sections to the markdown file
        with open(output_file, "w") as fp:
            for item in output_title:
                fp.write("%s\n" % item)
            for item in output_index:
                fp.write("%s\n" % item)
            for item in output:
                fp.write("%s\n" % item)

        # Echo title and sections (the index is only written to the file)
        print("")
        print("\n\n".join(output_title))
        print("")
        print("\n\n".join(output))
        print("")

    return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
    """
    Return the current input file path.

    :return: the value of the `input` attribute
    """
    return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
    """
    Return the file format of the given input file, falling back to the
    object's current input file when none is provided.

    :param input_file: optional path whose format should be detected
    :type input_file: str
    :return: the detected file format
    """

    # Fall back to the object's own input file
    target = input_file if input_file else self.get_input()
    return get_file_format(target)

This function returns the file format, detected either from the provided input file or, by default, from the object's current input file.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
    """
    Return the compression status of the given input file, falling back
    to the object's current input file when none is provided.

    :param input_file: optional path to inspect
    :type input_file: str
    :return: the compression status reported by `get_file_compressed`
    """

    # Fall back to the object's own input file
    target = input_file if input_file else self.get_input()
    return get_file_compressed(target)

The function get_input_compressed returns the compression status of the input file.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it defaults to None and the method falls back to the object's current input file.
Returns

The compression status of the input file.

def get_output(self) -> str:
    """
    Return the current output file path.

    :return: the value of the `output` attribute
    """

    return self.output

It returns the output file of the object.

Returns

The value of the output attribute.

def get_output_format(self, output_file: str = None) -> str:
    """
    Return the file format of the given output file, falling back to the
    object's current output file when none is provided.

    :param output_file: optional path whose format should be detected
    :type output_file: str
    :return: the detected file format
    """

    # Fall back to the object's own output file
    target = output_file if output_file else self.get_output()
    return get_file_format(target)

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
    """
    Return the configuration dictionary.

    :return: the `config` attribute
    """
    return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
    """
    Return the parameters dictionary.

    :return: the `param` attribute
    """
    return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
    """
    Return the database used by the connexion.

    :return: the `connexion_db` attribute
    """
    return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
    """
    Return the prefix of the object.

    :return: the `prefix` attribute
    """
    return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = "select") -> str:
    """
    Return the variants table expression suitable for a SQL clause.

    :param clause: SQL clause the table is used in ("select", "where",
        "update" or "from"), defaults to "select"
    :return: the table name, aliased as "variants" for a FROM clause
    """

    # Access mode from configuration (e.g. "RO" for read-only)
    access = self.get_config().get("access", None)

    # "select", "where", "update" — and any unknown clause — use the
    # bare table name
    if clause != "from":
        return self.table_variants

    # FROM clause: read-only parquet inputs are queried directly from
    # the input file itself
    if self.get_input_format() in ["parquet"] and access in ["RO"]:
        return f"'{self.get_input()}' as variants"
    return f"{self.table_variants} as variants"

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
    """
    Return the temporary directory path derived from the configuration
    and parameters, defaulting to "/tmp".

    :return: the temporary directory path
    """

    config = self.get_config()
    param = self.get_param()
    return get_tmp(config=config, param=param, default_tmp="/tmp")

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
    """
    Return the connexion type from the configuration.

    :return: the "connexion_type" config value, defaulting to "memory"
        when not set
    """
    return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory" when not set.

Returns

The connexion type is being returned.

def get_connexion(self):
    """
    Return the active database connection object.

    :return: the `conn` attribute
    """
    return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
    """
    Close the database connection.

    :return: the result of the underlying `close()` call
    """
    connection = self.conn
    return connection.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = "vcf"):
    """
    Return the VCF header, either as a reader object or as a list of
    header lines.

    :param type: "vcf" for a reader object, "list" for the raw header
        lines, defaults to "vcf"
    :return: the header in the requested representation (None for an
        unknown type)
    """

    if self.header_vcf:
        # Header already loaded on the object
        if type == "vcf":
            return self.header_vcf
        if type == "list":
            return self.header_list
    else:
        # Fall back to the minimal required VCF header
        if type == "vcf":
            return vcf.Reader(io.StringIO("\n".join(vcf_required)))
        if type == "list":
            return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
    """
    Return the number of header lines, excluding the final #CHROM line.

    :param file: optional path to a VCF header file to measure instead
        of the object's own header
    :type file: str
    :return: the header length minus the #CHROM line, or 0 when there
        is no header
    """

    # Explicit file takes precedence over the object's header
    if file:
        return len(self.read_vcf_header_file(file=file)) - 1
    header_list = self.get_header(type="list")
    if header_list:
        return len(header_list) - 1
    return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
    """
    Return the #CHROM column line of the VCF header.

    :return: the last header line (column names), or "" when there is
        no header
    """
    if not self.get_header():
        return ""
    return self.get_header(type="list")[-1]

This function returns the #CHROM column line of the VCF header.

Returns

The last header line (the column names), or an empty string when there is no header.

def get_header_columns_as_list(self) -> list:
    """
    Return the VCF header column names as a list.

    :return: the column names from the #CHROM line, or [] when there is
        no header
    """
    if not self.get_header():
        return []
    return self.get_header_columns().strip().split("\t")

This function returns the VCF header column names as a list.

Returns

The column names from the #CHROM line, or an empty list when there is no header.

def get_header_columns_as_sql(self) -> str:
    """
    Return the VCF header column names as a comma-separated list of
    double-quoted SQL identifiers.

    :return: e.g. '"#CHROM","POS","ID"' (empty string when there are no
        columns)
    """
    # Quote each column name so special characters (e.g. "#CHROM") are
    # valid SQL identifiers
    return ",".join(f'"{col}"' for col in self.get_header_columns_as_list())

This function returns the header column names as a comma-separated list of double-quoted SQL identifiers.

Returns

The SQL-ready list of header column names.

def get_header_sample_list(
    self, check: bool = False, samples: list = None, samples_force: bool = False
) -> list:
    """
    Return the list of sample names from the VCF header.

    :param check: when True, keep only samples whose column is a valid
        genotype column, defaults to False
    :type check: bool (optional)
    :param samples: optional subset of sample names to keep; names
        absent from the header are dropped with a warning
    :type samples: list
    :param samples_force: when True (and `samples` is given), return the
        filtered subset without performing the genotype check, defaults
        to False
    :type samples_force: bool (optional)
    :return: the selected sample names
    """

    if samples is None:
        # No subset requested: all samples declared in the header
        samples_list = self.header_vcf.samples
    else:
        # Keep only requested samples that exist in the header
        samples_list = []
        for sample in samples:
            if sample in self.header_vcf.samples:
                samples_list.append(sample)
            else:
                log.warning(f"Sample '{sample}' not defined in header")

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

    if check:
        # Keep only samples with a well-defined genotype column
        genotype_samples = []
        for sample in samples_list:
            if self.is_genotype_column(column=sample):
                genotype_samples.append(sample)
            else:
                log.warning(
                    f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                )
        samples_list = genotype_samples

    # Return samples list
    return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
    """
    Check whether a column of the input database is a genotype column.

    :param column: name of the column to check
    :type column: str
    :return: the result of the Database genotype check, or False when
        no column is given
    """
    if column is None:
        return False
    return Database(database=self.get_input()).is_genotype_column(column=column)

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
    """
    Return the "verbose" flag from the configuration.

    :return: the config "verbose" value, or False when absent
    """
    return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
    """
    Return the connexion format, validating it.

    :return: the `connexion_format` attribute ("duckdb" or "sqlite")
    :raises ValueError: if the format is neither "duckdb" nor "sqlite"
    """
    connexion_format = self.connexion_format
    if connexion_format in ["duckdb", "sqlite"]:
        return connexion_format
    log.error(f"Unknown connexion format {connexion_format}")
    raise ValueError(f"Unknown connexion format {connexion_format}")

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table(
    self,
    file,
    columns: str,
    header_len: int = 0,
    sep: str = "\t",
    chunksize: int = 1000000,
) -> None:
    """
    Read a delimited file in chunks and insert each chunk into the
    "variants" table of the current connexion (DuckDB or SQLite).

    :param file: path (or buffer) of the file to load
    :param columns: comma-separated column names used in the INSERT
        statement
    :type columns: str
    :param header_len: number of leading lines to skip before the data
        (e.g. the VCF header), defaults to 0
    :type header_len: int (optional)
    :param sep: field separator of the file, defaults to a tab character
    :type sep: str (optional)
    :param chunksize: number of rows read per chunk; overridden by the
        "load.chunk" config entry when present, defaults to 1000000
    :type chunksize: int (optional)
    """

    # Config: the "load.chunk" entry overrides the chunksize argument
    chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
    connexion_format = self.get_connexion_format()

    log.debug("chunksize: " + str(chunksize))

    # NOTE(review): if chunksize resolves to a falsy value (0/None),
    # nothing is inserted — confirm this is intended
    if chunksize:
        for chunk in pd.read_csv(
            file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
        ):
            if connexion_format in ["duckdb"]:
                # DuckDB resolves "chunk" in the SQL text via replacement
                # scan of the local DataFrame variable — the variable name
                # must stay "chunk" for this query to work
                sql_insert_into = (
                    f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                )
                self.conn.execute(sql_insert_into)
            elif connexion_format in ["sqlite"]:
                # SQLite path: let pandas append the chunk directly
                chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1300    def load_data(
1301        self,
1302        input_file: str = None,
1303        drop_variants_table: bool = False,
1304        sample_size: int = 20480,
1305    ) -> None:
1306        """
1307        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1308        table before loading the data and specify a sample size.
1309
1310        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1311        table
1312        :type input_file: str
1313        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1314        determines whether the variants table should be dropped before loading the data. If set to
1315        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1316        not be dropped, defaults to False
1317        :type drop_variants_table: bool (optional)
1318        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1319        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1320        20480
1321        :type sample_size: int (optional)
1322        """
1323
1324        log.info("Loading...")
1325
1326        # change input file
1327        if input_file:
1328            self.set_input(input_file)
1329            self.set_header()
1330
1331        # drop variants table
1332        if drop_variants_table:
1333            self.drop_variants_table()
1334
1335        # get table variants
1336        table_variants = self.get_table_variants()
1337
1338        # Access
1339        access = self.get_config().get("access", None)
1340        log.debug(f"access: {access}")
1341
1342        # Input format and compress
1343        input_format = self.get_input_format()
1344        input_compressed = self.get_input_compressed()
1345        log.debug(f"input_format: {input_format}")
1346        log.debug(f"input_compressed: {input_compressed}")
1347
1348        # input_compressed_format
1349        if input_compressed:
1350            input_compressed_format = "gzip"
1351        else:
1352            input_compressed_format = "none"
1353        log.debug(f"input_compressed_format: {input_compressed_format}")
1354
1355        # Connexion format
1356        connexion_format = self.get_connexion_format()
1357
1358        # Sample size
1359        if not sample_size:
1360            sample_size = -1
1361        log.debug(f"sample_size: {sample_size}")
1362
1363        # Load data
1364        log.debug(f"Load Data from {input_format}")
1365
1366        # DuckDB connexion
1367        if connexion_format in ["duckdb"]:
1368
1369            # Database already exists
1370            if self.input_format in ["db", "duckdb"]:
1371
1372                if connexion_format in ["duckdb"]:
1373                    log.debug(f"Input file format '{self.input_format}' duckDB")
1374                else:
1375                    log.error(
1376                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1377                    )
1378                    raise ValueError(
1379                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1380                    )
1381
1382            # Load from existing database format
1383            else:
1384
1385                try:
1386                    # Create Table or View
1387                    database = Database(database=self.input)
1388                    sql_from = database.get_sql_from(sample_size=sample_size)
1389
1390                    if access in ["RO"]:
1391                        sql_load = (
1392                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1393                        )
1394                    else:
1395                        sql_load = (
1396                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1397                        )
1398                    self.conn.execute(sql_load)
1399
1400                except:
1401                    # Format not available
1402                    log.error(f"Input file format '{self.input_format}' not available")
1403                    raise ValueError(
1404                        f"Input file format '{self.input_format}' not available"
1405                    )
1406
1407        # SQLite connexion
1408        elif connexion_format in ["sqlite"] and input_format in [
1409            "vcf",
1410            "tsv",
1411            "csv",
1412            "psv",
1413        ]:
1414
1415            # Main structure
1416            structure = {
1417                "#CHROM": "VARCHAR",
1418                "POS": "INTEGER",
1419                "ID": "VARCHAR",
1420                "REF": "VARCHAR",
1421                "ALT": "VARCHAR",
1422                "QUAL": "VARCHAR",
1423                "FILTER": "VARCHAR",
1424                "INFO": "VARCHAR",
1425            }
1426
1427            # Strcuture with samples
1428            structure_complete = structure
1429            if self.get_header_sample_list():
1430                structure["FORMAT"] = "VARCHAR"
1431                for sample in self.get_header_sample_list():
1432                    structure_complete[sample] = "VARCHAR"
1433
1434            # Columns list for create and insert
1435            sql_create_table_columns = []
1436            sql_create_table_columns_list = []
1437            for column in structure_complete:
1438                column_type = structure_complete[column]
1439                sql_create_table_columns.append(
1440                    f'"{column}" {column_type} default NULL'
1441                )
1442                sql_create_table_columns_list.append(f'"{column}"')
1443
1444            # Create database
1445            log.debug(f"Create Table {table_variants}")
1446            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1447            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1448            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1449            self.conn.execute(sql_create_table)
1450
1451            # chunksize define length of file chunk load file
1452            chunksize = 100000
1453
1454            # delimiter
1455            delimiter = file_format_delimiters.get(input_format, "\t")
1456
1457            # Load the input file
1458            with open(self.input, "rt") as input_file:
1459
1460                # Use the appropriate file handler based on the input format
1461                if input_compressed:
1462                    input_file = bgzf.open(self.input, "rt")
1463                if input_format in ["vcf"]:
1464                    header_len = self.get_header_length()
1465                else:
1466                    header_len = 0
1467
1468                # Insert the file contents into a table
1469                self.insert_file_to_table(
1470                    input_file,
1471                    columns=sql_create_table_columns_list_sql,
1472                    header_len=header_len,
1473                    sep=delimiter,
1474                    chunksize=chunksize,
1475                )
1476
1477        else:
1478            log.error(
1479                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1480            )
1481            raise ValueError(
1482                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1483            )
1484
1485        # Explode INFOS fields into table fields
1486        if self.get_explode_infos():
1487            self.explode_infos(
1488                prefix=self.get_explode_infos_prefix(),
1489                fields=self.get_explode_infos_fields(),
1490                force=True,
1491            )
1492
1493        # Create index after insertion
1494        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file to infer its structure. If it is falsy (e.g. None or 0), -1 is used, meaning no sampling limit, defaults to 20480
def get_explode_infos(self) -> bool:
1496    def get_explode_infos(self) -> bool:
1497        """
1498        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1499        to False if it is not set.
1500        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1501        value. If the parameter is not present, it will return False.
1502        """
1503
1504        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1506    def get_explode_infos_fields(
1507        self,
1508        explode_infos_fields: str = None,
1509        remove_fields_not_in_header: bool = False,
1510    ) -> list:
1511        """
1512        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1513        the input parameter `explode_infos_fields`.
1514
1515        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1516        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1517        comma-separated list of field names to explode
1518        :type explode_infos_fields: str
1519        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1520        flag that determines whether to remove fields that are not present in the header. If it is set
1521        to `True`, any field that is not in the header will be excluded from the list of exploded
1522        information fields. If it is set to `, defaults to False
1523        :type remove_fields_not_in_header: bool (optional)
1524        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1525        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1526        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1527        Otherwise, it returns a list of exploded information fields after removing any spaces and
1528        splitting the string by commas.
1529        """
1530
1531        # If no fields, get it in param
1532        if not explode_infos_fields:
1533            explode_infos_fields = (
1534                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1535            )
1536
1537        # If no fields, defined as all fields in header using keyword
1538        if not explode_infos_fields:
1539            explode_infos_fields = "*"
1540
1541        # If fields list not empty
1542        if explode_infos_fields:
1543
1544            # Input fields list
1545            if isinstance(explode_infos_fields, str):
1546                fields_input = explode_infos_fields.split(",")
1547            elif isinstance(explode_infos_fields, list):
1548                fields_input = explode_infos_fields
1549            else:
1550                fields_input = []
1551
1552            # Fields list without * keyword
1553            fields_without_all = fields_input.copy()
1554            if "*".casefold() in (item.casefold() for item in fields_without_all):
1555                fields_without_all.remove("*")
1556
1557            # Fields in header
1558            fields_in_header = sorted(list(set(self.get_header().infos)))
1559
1560            # Construct list of fields
1561            fields_output = []
1562            for field in fields_input:
1563
1564                # Strip field
1565                field = field.strip()
1566
1567                # format keyword * in regex
1568                if field.upper() in ["*"]:
1569                    field = ".*"
1570
1571                # Find all fields with pattern
1572                r = re.compile(field)
1573                fields_search = sorted(list(filter(r.match, fields_in_header)))
1574
1575                # Remove fields input from search
1576                if field in fields_search:
1577                    fields_search = [field]
1578                elif fields_search != [field]:
1579                    fields_search = sorted(
1580                        list(set(fields_search).difference(fields_input))
1581                    )
1582
1583                # If field is not in header (avoid not well formatted header)
1584                if not fields_search and not remove_fields_not_in_header:
1585                    fields_search = [field]
1586
1587                # Add found fields
1588                for new_field in fields_search:
1589                    # Add field, if not already exists, and if it is in header (if asked)
1590                    if (
1591                        new_field not in fields_output
1592                        and (
1593                            not remove_fields_not_in_header
1594                            or new_field in fields_in_header
1595                        )
1596                        and new_field not in [".*"]
1597                    ):
1598                        fields_output.append(new_field)
1599
1600            return fields_output
1601
1602        else:
1603
1604            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter is a comma-separated string (or a list) of field names to explode; each entry is treated as a regex pattern, and the keyword "*" expands to all fields in the header
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, the fields configured in the "explode" parameters are used; failing that, the "*" keyword is assumed, which expands to all fields in the header. Entries are stripped of spaces, split on commas, matched as regex patterns against the header fields, and deduplicated in the result.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1606    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1607        """
1608        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1609        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1610        not provided.
1611
1612        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1613        prefix to be used for exploding or expanding information
1614        :type explode_infos_prefix: str
1615        :return: the value of the variable `explode_infos_prefix`.
1616        """
1617
1618        if not explode_infos_prefix:
1619            explode_infos_prefix = (
1620                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1621            )
1622
1623        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode", {}).get("explode_infos_prefix", "") if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1625    def add_column(
1626        self,
1627        table_name,
1628        column_name,
1629        column_type,
1630        default_value=None,
1631        drop: bool = False,
1632    ) -> dict:
1633        """
1634        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1635        doesn't already exist.
1636
1637        :param table_name: The name of the table to which you want to add a column
1638        :param column_name: The parameter "column_name" is the name of the column that you want to add
1639        to the table
1640        :param column_type: The `column_type` parameter specifies the data type of the column that you
1641        want to add to the table. It should be a string that represents the desired data type, such as
1642        "INTEGER", "TEXT", "REAL", etc
1643        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1644        default value for the newly added column. If a default value is provided, it will be assigned to
1645        the column for any existing rows that do not have a value for that column
1646        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1647        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1648        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1649        to False
1650        :type drop: bool (optional)
1651        :return: a boolean value indicating whether the column was successfully added to the table.
1652        """
1653
1654        # added
1655        added = False
1656        dropped = False
1657
1658        # Check if the column already exists in the table
1659        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1660        columns = self.get_query_to_df(query).columns.tolist()
1661        if column_name.upper() in [c.upper() for c in columns]:
1662            log.debug(
1663                f"The {column_name} column already exists in the {table_name} table"
1664            )
1665            if drop:
1666                self.drop_column(table_name=table_name, column_name=column_name)
1667                dropped = True
1668            else:
1669                return None
1670        else:
1671            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1672
1673        # Add column in table
1674        add_column_query = (
1675            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1676        )
1677        if default_value is not None:
1678            add_column_query += f" DEFAULT {default_value}"
1679        self.execute_query(add_column_query)
1680        added = not dropped
1681        log.debug(
1682            f"The {column_name} column was successfully added to the {table_name} table"
1683        )
1684
1685        if added:
1686            added_column = {
1687                "table_name": table_name,
1688                "column_name": column_name,
1689                "column_type": column_type,
1690                "default_value": default_value,
1691            }
1692        else:
1693            added_column = None
1694
1695        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default),, defaults to False
Returns

a dict describing the added column ("table_name", "column_name", "column_type", "default_value") if the column was newly added to the table, or None otherwise.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1697    def drop_column(
1698        self, column: dict = None, table_name: str = None, column_name: str = None
1699    ) -> bool:
1700        """
1701        The `drop_column` function drops a specified column from a given table in a database and returns
1702        True if the column was successfully dropped, and False if the column does not exist in the
1703        table.
1704
1705        :param column: The `column` parameter is a dictionary that contains information about the column
1706        you want to drop. It has two keys:
1707        :type column: dict
1708        :param table_name: The `table_name` parameter is the name of the table from which you want to
1709        drop a column
1710        :type table_name: str
1711        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1712        from the table
1713        :type column_name: str
1714        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1715        and False if the column does not exist in the table.
1716        """
1717
1718        # Find column infos
1719        if column:
1720            if isinstance(column, dict):
1721                table_name = column.get("table_name", None)
1722                column_name = column.get("column_name", None)
1723            elif isinstance(column, str):
1724                table_name = self.get_table_variants()
1725                column_name = column
1726            else:
1727                table_name = None
1728                column_name = None
1729
1730        if not table_name and not column_name:
1731            return False
1732
1733        # Removed
1734        removed = False
1735
1736        # Check if the column already exists in the table
1737        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1738        columns = self.get_query_to_df(query).columns.tolist()
1739        if column_name in columns:
1740            log.debug(f"The {column_name} column exists in the {table_name} table")
1741        else:
1742            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1743            return False
1744
1745        # Add column in table # ALTER TABLE integers DROP k
1746        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1747        self.execute_query(add_column_query)
1748        removed = True
1749        log.debug(
1750            f"The {column_name} column was successfully dropped to the {table_name} table"
1751        )
1752
1753        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is either a dict with the keys "table_name" and "column_name" describing the column to drop, or a string holding a column name of the variants table
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
1755    def explode_infos(
1756        self,
1757        prefix: str = None,
1758        create_index: bool = False,
1759        fields: list = None,
1760        force: bool = False,
1761        proccess_all_fields_together: bool = False,
1762        table: str = None,
1763    ) -> list:
1764        """
1765        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
1766        individual columns, returning a list of added columns.
1767
1768        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1769        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1770        `self.get_explode_infos_prefix()` as the prefix
1771        :type prefix: str
1772        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1773        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1774        `False`, indexes will not be created. The default value is `False`, defaults to False
1775        :type create_index: bool (optional)
1776        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
1777        that you want to explode into individual columns. If this parameter is not provided, all INFO
1778        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
1779        a list to the `
1780        :type fields: list
1781        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
1782        determines whether to drop and recreate a column if it already exists in the table. If `force`
1783        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
1784        defaults to False
1785        :type force: bool (optional)
1786        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1787        flag that determines whether to process all the INFO fields together or individually. If set to
1788        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1789        be processed individually. The default value is, defaults to False
1790        :type proccess_all_fields_together: bool (optional)
1791        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
1792        of the table where the exploded INFO fields will be added as individual columns. If you provide
1793        a value for the `table` parameter, the function will use that table name. If the `table`
1794        parameter is
1795        :type table: str
1796        :return: The `explode_infos` function returns a list of added columns.
1797        """
1798
1799        # drop indexes
1800        self.drop_indexes()
1801
1802        # connexion format
1803        connexion_format = self.get_connexion_format()
1804
1805        # Access
1806        access = self.get_config().get("access", None)
1807
1808        # Added columns
1809        added_columns = []
1810
1811        if access not in ["RO"]:
1812
1813            # prefix
1814            if prefix in [None, True] or not isinstance(prefix, str):
1815                if self.get_explode_infos_prefix() not in [None, True]:
1816                    prefix = self.get_explode_infos_prefix()
1817                else:
1818                    prefix = "INFO/"
1819
1820            # table variants
1821            if table is not None:
1822                table_variants = table
1823            else:
1824                table_variants = self.get_table_variants(clause="select")
1825
1826            # extra infos
1827            try:
1828                extra_infos = self.get_extra_infos()
1829            except:
1830                extra_infos = []
1831
1832            # Header infos
1833            header_infos = self.get_header().infos
1834
1835            log.debug(
1836                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1837            )
1838
1839            sql_info_alter_table_array = []
1840
1841            # Info fields to check
1842            fields_list = list(header_infos)
1843            if fields:
1844                fields_list += fields
1845            fields_list = set(fields_list)
1846
1847            # If no fields
1848            if not fields:
1849                fields = []
1850
1851            # Translate fields if patterns
1852            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1853
1854            for info in fields:
1855
1856                info_id_sql = prefix + info
1857
1858                if (
1859                    info in fields_list
1860                    or prefix + info in fields_list
1861                    or info in extra_infos
1862                ):
1863
1864                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1865
1866                    if info in header_infos:
1867                        info_type = header_infos[info].type
1868                        info_num = header_infos[info].num
1869                    else:
1870                        info_type = "String"
1871                        info_num = 0
1872
1873                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1874                    if info_num != 1:
1875                        type_sql = "VARCHAR"
1876
1877                    # Add field
1878                    added_column = self.add_column(
1879                        table_name=table_variants,
1880                        column_name=info_id_sql,
1881                        column_type=type_sql,
1882                        default_value="null",
1883                        drop=force,
1884                    )
1885
1886                    if added_column:
1887                        added_columns.append(added_column)
1888
1889                    if added_column or force:
1890
1891                        # add field to index
1892                        self.index_additionnal_fields.append(info_id_sql)
1893
1894                        # Update field array
1895                        if connexion_format in ["duckdb"]:
1896                            update_info_field = f"""
1897                            "{info_id_sql}" =
1898                                CASE
1899                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1900                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1901                                END
1902                            """
1903                        elif connexion_format in ["sqlite"]:
1904                            update_info_field = f"""
1905                                "{info_id_sql}" =
1906                                    CASE
1907                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1908                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1909                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1910                                    END
1911                            """
1912
1913                        sql_info_alter_table_array.append(update_info_field)
1914
1915            if sql_info_alter_table_array:
1916
1917                # By chromosomes
1918                try:
1919                    chromosomes_list = list(
1920                        self.get_query_to_df(
1921                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1922                        )["#CHROM"]
1923                    )
1924                except:
1925                    chromosomes_list = [None]
1926
1927                for chrom in chromosomes_list:
1928                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1929
1930                    # Where clause
1931                    where_clause = ""
1932                    if chrom and len(chromosomes_list) > 1:
1933                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1934
1935                    # Update table
1936                    if proccess_all_fields_together:
1937                        sql_info_alter_table_array_join = ", ".join(
1938                            sql_info_alter_table_array
1939                        )
1940                        if sql_info_alter_table_array_join:
1941                            sql_info_alter_table = f"""
1942                                UPDATE {table_variants}
1943                                SET {sql_info_alter_table_array_join}
1944                                {where_clause}
1945                                """
1946                            log.debug(
1947                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1948                            )
1949                            # log.debug(sql_info_alter_table)
1950                            self.conn.execute(sql_info_alter_table)
1951                    else:
1952                        sql_info_alter_num = 0
1953                        for sql_info_alter in sql_info_alter_table_array:
1954                            sql_info_alter_num += 1
1955                            sql_info_alter_table = f"""
1956                                UPDATE {table_variants}
1957                                SET {sql_info_alter}
1958                                {where_clause}
1959                                """
1960                            log.debug(
1961                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1962                            )
1963                            # log.debug(sql_info_alter_table)
1964                            self.conn.execute(sql_info_alter_table)
1965
1966        # create indexes
1967        if create_index:
1968            self.create_indexes()
1969
1970        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the fields parameter.
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if set to False, the existing column is kept as is. Defaults to False.
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. The default value is False.
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1972    def create_indexes(self) -> None:
1973        """
1974        Create indexes on the table after insertion
1975        """
1976
1977        # Access
1978        access = self.get_config().get("access", None)
1979
1980        # get table variants
1981        table_variants = self.get_table_variants("FROM")
1982
1983        if self.get_indexing() and access not in ["RO"]:
1984            # Create index
1985            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1986            self.conn.execute(sql_create_table_index)
1987            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1988            self.conn.execute(sql_create_table_index)
1989            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1990            self.conn.execute(sql_create_table_index)
1991            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1992            self.conn.execute(sql_create_table_index)
1993            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1994            self.conn.execute(sql_create_table_index)
1995            for field in self.index_additionnal_fields:
1996                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1997                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
1999    def drop_indexes(self) -> None:
2000        """
2001        Create indexes on the table after insertion
2002        """
2003
2004        # Access
2005        access = self.get_config().get("access", None)
2006
2007        # get table variants
2008        table_variants = self.get_table_variants("FROM")
2009
2010        # Get database format
2011        connexion_format = self.get_connexion_format()
2012
2013        if access not in ["RO"]:
2014            if connexion_format in ["duckdb"]:
2015                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2016            elif connexion_format in ["sqlite"]:
2017                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2018
2019            list_indexes = self.conn.execute(sql_list_indexes)
2020            index_names = [row[0] for row in list_indexes.fetchall()]
2021            for index in index_names:
2022                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2023                self.conn.execute(sql_drop_table_index)

Drop all indexes on the variants table.

def read_vcf_header(self, f) -> list:
2025    def read_vcf_header(self, f) -> list:
2026        """
2027        It reads the header of a VCF file and returns a list of the header lines
2028
2029        :param f: the file object
2030        :return: The header lines of the VCF file.
2031        """
2032
2033        header_list = []
2034        for line in f:
2035            header_list.append(line)
2036            if line.startswith("#CHROM"):
2037                break
2038        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2040    def read_vcf_header_file(self, file: str = None) -> list:
2041        """
2042        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2043        uncompressed files.
2044
2045        :param file: The `file` parameter is a string that represents the path to the VCF header file
2046        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2047        default to `None`
2048        :type file: str
2049        :return: The function `read_vcf_header_file` returns a list.
2050        """
2051
2052        if self.get_input_compressed(input_file=file):
2053            with bgzf.open(file, "rt") as f:
2054                return self.read_vcf_header(f=f)
2055        else:
2056            with open(file, "rt") as f:
2057                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2059    def execute_query(self, query: str):
2060        """
2061        It takes a query as an argument, executes it, and returns the results
2062
2063        :param query: The query to be executed
2064        :return: The result of the query is being returned.
2065        """
2066        if query:
2067            return self.conn.execute(query)  # .fetchall()
2068        else:
2069            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
2071    def export_output(
2072        self,
2073        output_file: str | None = None,
2074        output_header: str | None = None,
2075        export_header: bool = True,
2076        query: str | None = None,
2077        parquet_partitions: list | None = None,
2078        chunk_size: int | None = None,
2079        threads: int | None = None,
2080        sort: bool = False,
2081        index: bool = False,
2082        order_by: str | None = None,
2083    ) -> bool:
2084        """
2085        The `export_output` function exports data from a VCF file to a specified output file in various
2086        formats, including VCF, CSV, TSV, PSV, and Parquet.
2087
2088        :param output_file: The `output_file` parameter is a string that specifies the name of the
2089        output file to be generated by the function. This is where the exported data will be saved
2090        :type output_file: str
2091        :param output_header: The `output_header` parameter is a string that specifies the name of the
2092        file where the header of the VCF file will be exported. If this parameter is not provided, the
2093        header will be exported to a file with the same name as the `output_file` parameter, but with
2094        the extension "
2095        :type output_header: str
2096        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2097        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2098        True, the header will be exported to a file. If `export_header` is False, the header will not
2099        be, defaults to True, if output format is not VCF
2100        :type export_header: bool (optional)
2101        :param query: The `query` parameter is an optional SQL query that can be used to filter and
2102        select specific data from the VCF file before exporting it. If provided, only the data that
2103        matches the query will be exported
2104        :type query: str
2105        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2106        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2107        organize data in a hierarchical directory structure based on the values of one or more columns.
2108        This can improve query performance when working with large datasets
2109        :type parquet_partitions: list
2110        :param chunk_size: The `chunk_size` parameter specifies the number of
2111        records in batch when exporting data in Parquet format. This parameter is used for
2112        partitioning the Parquet file into multiple files.
2113        :type chunk_size: int
2114        :param threads: The `threads` parameter is an optional parameter that specifies the number of
2115        threads to be used during the export process. It determines the level of parallelism and can
2116        improve the performance of the export operation. If not provided, the function will use the
2117        default number of threads
2118        :type threads: int
2119        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
2120        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
2121        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
2122        False
2123        :type sort: bool (optional)
2124        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2125        created on the output file. If `index` is True, an index will be created. If `index` is False,
2126        no index will be created. The default value is False, defaults to False
2127        :type index: bool (optional)
2128        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2129        sorting the output file. This parameter is only applicable when exporting data in VCF format
2130        :type order_by: str
2131        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2132        None if it doesn't.
2133        """
2134
2135        # Log
2136        log.info("Exporting...")
2137
2138        # Full path
2139        output_file = full_path(output_file)
2140        output_header = full_path(output_header)
2141
2142        # Config
2143        config = self.get_config()
2144
2145        # Param
2146        param = self.get_param()
2147
2148        # Tmp files to remove
2149        tmp_to_remove = []
2150
2151        # If no output, get it
2152        if not output_file:
2153            output_file = self.get_output()
2154
2155        # If not threads
2156        if not threads:
2157            threads = self.get_threads()
2158
2159        # Auto header name with extension
2160        if export_header or output_header:
2161            if not output_header:
2162                output_header = f"{output_file}.hdr"
2163            # Export header
2164            self.export_header(output_file=output_file)
2165
2166        # Switch off export header if VCF output
2167        output_file_type = get_file_format(output_file)
2168        if output_file_type in ["vcf"]:
2169            export_header = False
2170            tmp_to_remove.append(output_header)
2171
2172        # Chunk size
2173        if not chunk_size:
2174            chunk_size = config.get("chunk_size", None)
2175
2176        # Parquet partition
2177        if not parquet_partitions:
2178            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2179        if parquet_partitions and isinstance(parquet_partitions, str):
2180            parquet_partitions = parquet_partitions.split(",")
2181
2182        # Order by
2183        if not order_by:
2184            order_by = param.get("export", {}).get("order_by", "")
2185
2186        # Header in output
2187        header_in_output = param.get("export", {}).get("include_header", False)
2188
2189        # Database
2190        database_source = self.get_connexion()
2191
2192        # Connexion format
2193        connexion_format = self.get_connexion_format()
2194
2195        # Explode infos
2196        if self.get_explode_infos():
2197            self.explode_infos(
2198                prefix=self.get_explode_infos_prefix(),
2199                fields=self.get_explode_infos_fields(),
2200                force=False,
2201            )
2202
2203        # if connexion_format in ["sqlite"] or query:
2204        if connexion_format in ["sqlite"]:
2205
2206            # Export in Parquet
2207            random_tmp = "".join(
2208                random.choice(string.ascii_lowercase) for i in range(10)
2209            )
2210            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2211            tmp_to_remove.append(database_source)
2212
2213            # Table Variants
2214            table_variants = self.get_table_variants()
2215
2216            # Create export query
2217            sql_query_export_subquery = f"""
2218                SELECT * FROM {table_variants}
2219                """
2220
2221            # Write source file
2222            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2223
2224        # Create database
2225        database = Database(
2226            database=database_source,
2227            table="variants",
2228            header_file=output_header,
2229            conn_config=self.get_connexion_config(),
2230        )
2231
2232        # Existing colomns header
2233        existing_columns_header = database.get_header_columns_from_database()
2234
2235        # Sample list
2236        if output_file_type in ["vcf"]:
2237            get_samples = self.get_samples()
2238            get_samples_check = self.get_samples_check()
2239            samples_force = get_samples is not None
2240            sample_list = self.get_header_sample_list(
2241                check=get_samples_check,
2242                samples=get_samples,
2243                samples_force=samples_force,
2244            )
2245        else:
2246            sample_list = None
2247
2248        # Export file
2249        database.export(
2250            output_database=output_file,
2251            output_header=output_header,
2252            existing_columns_header=existing_columns_header,
2253            parquet_partitions=parquet_partitions,
2254            chunk_size=chunk_size,
2255            threads=threads,
2256            sort=sort,
2257            index=index,
2258            header_in_output=header_in_output,
2259            order_by=order_by,
2260            query=query,
2261            export_header=export_header,
2262            sample_list=sample_list,
2263        )
2264
2265        # Remove
2266        remove_if_exists(tmp_to_remove)
2267
2268        return (os.path.exists(output_file) or None) and (
2269            os.path.exists(output_file) or None
2270        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr".
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file; if False, it will not be. Defaults to True, and is ignored when the output format is VCF.
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2272    def get_extra_infos(self, table: str = None) -> list:
2273        """
2274        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2275        in the header.
2276
2277        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2278        name of the table from which you want to retrieve the extra columns that are not present in the
2279        header. If the `table` parameter is not provided when calling the function, it will default to
2280        using the variants
2281        :type table: str
2282        :return: A list of columns that are in the specified table but not in the header of the table.
2283        """
2284
2285        header_columns = []
2286
2287        if not table:
2288            table = self.get_table_variants(clause="from")
2289            header_columns = self.get_header_columns()
2290
2291        # Check all columns in the database
2292        query = f""" SELECT * FROM {table} LIMIT 1 """
2293        log.debug(f"query {query}")
2294        table_columns = self.get_query_to_df(query).columns.tolist()
2295        extra_columns = []
2296
2297        # Construct extra infos (not in header)
2298        for column in table_columns:
2299            if column not in header_columns:
2300                extra_columns.append(column)
2301
2302        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2304    def get_extra_infos_sql(self, table: str = None) -> str:
2305        """
2306        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2307        by double quotes
2308
2309        :param table: The name of the table to get the extra infos from. If None, the default table is
2310        used
2311        :type table: str
2312        :return: A string of the extra infos
2313        """
2314
2315        return ", ".join(
2316            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2317        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2319    def export_header(
2320        self,
2321        header_name: str = None,
2322        output_file: str = None,
2323        output_file_ext: str = ".hdr",
2324        clean_header: bool = True,
2325        remove_chrom_line: bool = False,
2326    ) -> str:
2327        """
2328        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2329        specified options, and writes it to a new file.
2330
2331        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2332        this parameter is not specified, the header will be written to the output file
2333        :type header_name: str
2334        :param output_file: The `output_file` parameter in the `export_header` function is used to
2335        specify the name of the output file where the header will be written. If this parameter is not
2336        provided, the header will be written to a temporary file
2337        :type output_file: str
2338        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2339        string that represents the extension of the output header file. By default, it is set to ".hdr"
2340        if not specified by the user. This extension will be appended to the `output_file` name to
2341        create the final, defaults to .hdr
2342        :type output_file_ext: str (optional)
2343        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2344        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2345        `True`, the function will clean the header by modifying certain lines based on a specific
2346        pattern. If `clean_header`, defaults to True
2347        :type clean_header: bool (optional)
2348        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2349        boolean flag that determines whether the #CHROM line should be removed from the header before
2350        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2351        defaults to False
2352        :type remove_chrom_line: bool (optional)
2353        :return: The function `export_header` returns the name of the temporary header file that is
2354        created.
2355        """
2356
2357        if not header_name and not output_file:
2358            output_file = self.get_output()
2359
2360        if self.get_header():
2361
2362            # Get header object
2363            header_obj = self.get_header()
2364
2365            # Create database
2366            db_for_header = Database(database=self.get_input())
2367
2368            # Get real columns in the file
2369            db_header_columns = db_for_header.get_columns()
2370
2371            with tempfile.TemporaryDirectory() as tmpdir:
2372
2373                # Write header file
2374                header_file_tmp = os.path.join(tmpdir, "header")
2375                f = open(header_file_tmp, "w")
2376                vcf.Writer(f, header_obj)
2377                f.close()
2378
2379                # Replace #CHROM line with rel columns
2380                header_list = db_for_header.read_header_file(
2381                    header_file=header_file_tmp
2382                )
2383                header_list[-1] = "\t".join(db_header_columns)
2384
2385                # Remove CHROM line
2386                if remove_chrom_line:
2387                    header_list.pop()
2388
2389                # Clean header
2390                if clean_header:
2391                    header_list_clean = []
2392                    for head in header_list:
2393                        # Clean head for malformed header
2394                        head_clean = head
2395                        head_clean = re.subn(
2396                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2397                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2398                            head_clean,
2399                            2,
2400                        )[0]
2401                        # Write header
2402                        header_list_clean.append(head_clean)
2403                    header_list = header_list_clean
2404
2405            tmp_header_name = output_file + output_file_ext
2406
2407            f = open(tmp_header_name, "w")
2408            for line in header_list:
2409                f.write(line)
2410            f.close()
2411
2412        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern; when False, the header is written unchanged. Defaults to True.
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it will be kept. Defaults to False.
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2414    def export_variant_vcf(
2415        self,
2416        vcf_file,
2417        remove_info: bool = False,
2418        add_samples: bool = True,
2419        list_samples: list = [],
2420        where_clause: str = "",
2421        index: bool = False,
2422        threads: int | None = None,
2423    ) -> bool | None:
2424        """
2425        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2426        remove INFO field, add samples, and control compression and indexing.
2427
2428        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2429        written to. It is the output file that will contain the filtered VCF data based on the specified
2430        parameters
2431        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2432        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2433        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2434        in, defaults to False
2435        :type remove_info: bool (optional)
2436        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2437        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2438        If set to False, the samples will be removed. The default value is True, defaults to True
2439        :type add_samples: bool (optional)
2440        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2441        in the output VCF file. By default, all samples will be included. If you provide a list of
2442        samples, only those samples will be included in the output file
2443        :type list_samples: list
2444        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2445        determines whether or not to create an index for the output VCF file. If `index` is set to
2446        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2447        :type index: bool (optional)
2448        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2449        number of threads to use for exporting the VCF file. It determines how many parallel threads
2450        will be used during the export process. More threads can potentially speed up the export process
2451        by utilizing multiple cores of the processor. If
2452        :type threads: int | None
2453        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2454        method with various parameters including the output file, query, threads, sort flag, and index
2455        flag. The `export_output` method is responsible for exporting the VCF data based on the
2456        specified parameters and configurations provided in the `export_variant_vcf` function.
2457        """
2458
2459        # Config
2460        config = self.get_config()
2461
2462        # Extract VCF
2463        log.debug("Export VCF...")
2464
2465        # Table variants
2466        table_variants = self.get_table_variants()
2467
2468        # Threads
2469        if not threads:
2470            threads = self.get_threads()
2471
2472        # Info fields
2473        if remove_info:
2474            if not isinstance(remove_info, str):
2475                remove_info = "."
2476            info_field = f"""'{remove_info}' as INFO"""
2477        else:
2478            info_field = "INFO"
2479
2480        # Samples fields
2481        if add_samples:
2482            if not list_samples:
2483                list_samples = self.get_header_sample_list()
2484            if list_samples:
2485                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2486            else:
2487                samples_fields = ""
2488            log.debug(f"samples_fields: {samples_fields}")
2489        else:
2490            samples_fields = ""
2491
2492        # Where clause
2493        if where_clause is None:
2494            where_clause = ""
2495
2496        # Variants
2497        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2498        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2499        log.debug(f"sql_query_select={sql_query_select}")
2500
2501        return self.export_output(
2502            output_file=vcf_file,
2503            output_header=None,
2504            export_header=True,
2505            query=sql_query_select,
2506            parquet_partitions=None,
2507            chunk_size=config.get("chunk_size", None),
2508            threads=threads,
2509            sort=True,
2510            index=index,
2511            order_by=None,
2512        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in the output. Defaults to False.
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix; if set to False, no index is created. Defaults to False.
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, the instance's configured thread count is used.
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2514    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2515        """
2516        It takes a list of commands and runs them in parallel using the number of threads specified
2517
2518        :param commands: A list of commands to run
2519        :param threads: The number of threads to use, defaults to 1 (optional)
2520        """
2521
2522        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2524    def get_threads(self, default: int = 1) -> int:
2525        """
2526        This function returns the number of threads to use for a job, with a default value of 1 if not
2527        specified.
2528
2529        :param default: The `default` parameter in the `get_threads` method is used to specify the
2530        default number of threads to use if no specific value is provided. If no value is provided for
2531        the `threads` parameter in the configuration or input parameters, the `default` value will be
2532        used, defaults to 1
2533        :type default: int (optional)
2534        :return: the number of threads to use for the current job.
2535        """
2536
2537        # Config
2538        config = self.get_config()
2539
2540        # Param
2541        param = self.get_param()
2542
2543        # Input threads
2544        input_thread = param.get("threads", config.get("threads", None))
2545
2546        # Check threads
2547        if not input_thread:
2548            threads = default
2549        elif int(input_thread) <= 0:
2550            threads = os.cpu_count()
2551        else:
2552            threads = int(input_thread)
2553        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2555    def get_memory(self, default: str = None) -> str:
2556        """
2557        This function retrieves the memory value from parameters or configuration with a default value
2558        if not found.
2559
2560        :param default: The `get_memory` function takes in a default value as a string parameter. This
2561        default value is used as a fallback in case the `memory` parameter is not provided in the
2562        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2563        the function
2564        :type default: str
2565        :return: The `get_memory` function returns a string value representing the memory parameter. If
2566        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2567        return the default value provided as an argument to the function.
2568        """
2569
2570        # Config
2571        config = self.get_config()
2572
2573        # Param
2574        param = self.get_param()
2575
2576        # Input threads
2577        input_memory = param.get("memory", config.get("memory", None))
2578
2579        # Check threads
2580        if input_memory:
2581            memory = input_memory
2582        else:
2583            memory = default
2584
2585        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns this default value.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2587    def update_from_vcf(self, vcf_file: str) -> None:
2588        """
2589        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2590
2591        :param vcf_file: the path to the VCF file
2592        """
2593
2594        connexion_format = self.get_connexion_format()
2595
2596        if connexion_format in ["duckdb"]:
2597            self.update_from_vcf_duckdb(vcf_file)
2598        elif connexion_format in ["sqlite"]:
2599            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using a duckdb connexion.

        The VCF body is loaded into a pandas DataFrame and correlated against
        the variants table on #CHROM/POS/REF/ALT; the VCF INFO content is
        appended (';'-separated) to any existing INFO value. '' and '.' are
        treated as empty INFO values on both sides.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame: skip the meta-header lines and
        # use the "#CHROM" line as the column header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # The query below reads the local DataFrame "vcf_df" directly by name
        # (duckdb Python replacement scan) -- do not rename that variable.
        # The ';' separator is only inserted when both the existing INFO and
        # the matching VCF INFO are non-empty.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file, using a
        SQLite connexion.

        A temporary table is created with the variants schema, the VCF body is
        loaded into it, then the INFO column of the variants table is updated
        by appending (';'-separated) the matching VCF INFO value. The
        temporary table is dropped afterwards.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create an empty temporary table with the same schema as 'variants'
        # (WHERE 0 selects no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body (header lines stripped via comment='#') into the
        # temporary table.
        # NOTE(review): assumes a sites-only VCF body with exactly 8 columns
        # (no FORMAT/sample columns) -- confirm against callers.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data, matching on #CHROM/POS/ALT/REF.
        # '' and '.' are treated as empty INFO; the ';' separator is only
        # inserted when both sides are non-empty.
        # warning: CONCAT as || operator (SQLite has no CONCAT function)
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table.
        # NOTE(review): not wrapped in try/finally, so the temp table leaks if
        # the UPDATE above raises -- consider confirming whether that matters
        # for long-lived connexions.
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2715    def drop_variants_table(self) -> None:
2716        """
2717        > This function drops the variants table
2718        """
2719
2720        table_variants = self.get_table_variants()
2721        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2722        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-id column to the variants table and populate it with a
        hash built from the assembly, "#CHROM", "POS", "REF", "ALT" and a
        SVTYPE-related term (see the review note on the UPDATE below).

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into a column (removed again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column (fall back to the default name)
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id", not
        # variant_id_column -- confirm this is intended when a custom column
        # name is passed.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is a single-quoted SQL string
            # literal, so the hash input is the constant column-name text, not
            # the SVTYPE value -- confirm whether a column reference
            # ("{prefix}SVTYPE") was intended. Changing it would change every
            # computed variant_id, so it is flagged rather than fixed here.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2783    def get_variant_id_column(
2784        self, variant_id_column: str = "variant_id", force: bool = None
2785    ) -> str:
2786        """
2787        This function returns the variant_id column name
2788
2789        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2790        defaults to variant_id
2791        :type variant_id_column: str (optional)
2792        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2793        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2794        if it is not already set, or if it is set
2795        :type force: bool
2796        :return: The variant_id column name.
2797        """
2798
2799        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column, even if it is already set. If False or None, the variant_id will only be set if it is not already present.
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2805    def scan_databases(
2806        self,
2807        database_formats: list = ["parquet"],
2808        database_releases: list = ["current"],
2809    ) -> dict:
2810        """
2811        The function `scan_databases` scans for available databases based on specified formats and
2812        releases.
2813
2814        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2815        of the databases to be scanned. In this case, the accepted format is "parquet"
2816        :type database_formats: list ["parquet"]
2817        :param database_releases: The `database_releases` parameter is a list that specifies the
2818        releases of the databases to be scanned. In the provided function, the default value for
2819        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2820        databases that are in the "current"
2821        :type database_releases: list
2822        :return: The function `scan_databases` returns a dictionary containing information about
2823        databases that match the specified formats and releases.
2824        """
2825
2826        # Config
2827        config = self.get_config()
2828
2829        # Param
2830        param = self.get_param()
2831
2832        # Param - Assembly
2833        assembly = param.get("assembly", config.get("assembly", None))
2834        if not assembly:
2835            assembly = DEFAULT_ASSEMBLY
2836            log.warning(f"Default assembly '{assembly}'")
2837
2838        # Scan for availabled databases
2839        log.info(
2840            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2841        )
2842        databases_infos_dict = databases_infos(
2843            database_folder_releases=database_releases,
2844            database_formats=database_formats,
2845            assembly=assembly,
2846            config=config,
2847        )
2848        log.info(
2849            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2850        )
2851
2852        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current" release folder.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2854    def annotation(self) -> None:
2855        """
2856        It annotates the VCF file with the annotations specified in the config file.
2857        """
2858
2859        # Config
2860        config = self.get_config()
2861
2862        # Param
2863        param = self.get_param()
2864
2865        # Param - Assembly
2866        assembly = param.get("assembly", config.get("assembly", None))
2867        if not assembly:
2868            assembly = DEFAULT_ASSEMBLY
2869            log.warning(f"Default assembly '{assembly}'")
2870
2871        # annotations databases folders
2872        annotations_databases = set(
2873            config.get("folders", {})
2874            .get("databases", {})
2875            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2876            + config.get("folders", {})
2877            .get("databases", {})
2878            .get("parquet", ["~/howard/databases/parquet/current"])
2879            + config.get("folders", {})
2880            .get("databases", {})
2881            .get("bcftools", ["~/howard/databases/bcftools/current"])
2882        )
2883
2884        # Get param annotations
2885        if param.get("annotations", None) and isinstance(
2886            param.get("annotations", None), str
2887        ):
2888            log.debug(param.get("annotations", None))
2889            param_annotation_list = param.get("annotations").split(",")
2890        else:
2891            param_annotation_list = []
2892
2893        # Each tools param
2894        if param.get("annotation_parquet", None) != None:
2895            log.debug(
2896                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2897            )
2898            if isinstance(param.get("annotation_parquet", None), list):
2899                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2900            else:
2901                param_annotation_list.append(param.get("annotation_parquet"))
2902        if param.get("annotation_snpsift", None) != None:
2903            if isinstance(param.get("annotation_snpsift", None), list):
2904                param_annotation_list.append(
2905                    "snpsift:"
2906                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2907                )
2908            else:
2909                param_annotation_list.append(
2910                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2911                )
2912        if param.get("annotation_snpeff", None) != None:
2913            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2914        if param.get("annotation_bcftools", None) != None:
2915            if isinstance(param.get("annotation_bcftools", None), list):
2916                param_annotation_list.append(
2917                    "bcftools:"
2918                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2919                )
2920            else:
2921                param_annotation_list.append(
2922                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2923                )
2924        if param.get("annotation_annovar", None) != None:
2925            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2926        if param.get("annotation_exomiser", None) != None:
2927            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2928        if param.get("annotation_splice", None) != None:
2929            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2930
2931        # Merge param annotations list
2932        param["annotations"] = ",".join(param_annotation_list)
2933
2934        # debug
2935        log.debug(f"param_annotations={param['annotations']}")
2936
2937        if param.get("annotations"):
2938
2939            # Log
2940            # log.info("Annotations - Check annotation parameters")
2941
2942            if not "annotation" in param:
2943                param["annotation"] = {}
2944
2945            # List of annotations parameters
2946            annotations_list_input = {}
2947            if isinstance(param.get("annotations", None), str):
2948                annotation_file_list = [
2949                    value for value in param.get("annotations", "").split(",")
2950                ]
2951                for annotation_file in annotation_file_list:
2952                    annotations_list_input[annotation_file] = {"INFO": None}
2953            else:
2954                annotations_list_input = param.get("annotations", {})
2955
2956            log.info(f"Quick Annotations:")
2957            for annotation_key in list(annotations_list_input.keys()):
2958                log.info(f"   {annotation_key}")
2959
2960            # List of annotations and associated fields
2961            annotations_list = {}
2962
2963            for annotation_file in annotations_list_input:
2964
2965                # Explode annotations if ALL
2966                if (
2967                    annotation_file.upper() == "ALL"
2968                    or annotation_file.upper().startswith("ALL:")
2969                ):
2970
2971                    # check ALL parameters (formats, releases)
2972                    annotation_file_split = annotation_file.split(":")
2973                    database_formats = "parquet"
2974                    database_releases = "current"
2975                    for annotation_file_option in annotation_file_split[1:]:
2976                        database_all_options_split = annotation_file_option.split("=")
2977                        if database_all_options_split[0] == "format":
2978                            database_formats = database_all_options_split[1].split("+")
2979                        if database_all_options_split[0] == "release":
2980                            database_releases = database_all_options_split[1].split("+")
2981
2982                    # Scan for availabled databases
2983                    databases_infos_dict = self.scan_databases(
2984                        database_formats=database_formats,
2985                        database_releases=database_releases,
2986                    )
2987
2988                    # Add found databases in annotation parameters
2989                    for database_infos in databases_infos_dict.keys():
2990                        annotations_list[database_infos] = {"INFO": None}
2991
2992                else:
2993                    annotations_list[annotation_file] = annotations_list_input[
2994                        annotation_file
2995                    ]
2996
2997            # Check each databases
2998            if len(annotations_list):
2999
3000                log.info(
3001                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
3002                )
3003
3004                for annotation_file in annotations_list:
3005
3006                    # Init
3007                    annotations = annotations_list.get(annotation_file, None)
3008
3009                    # Annotation snpEff
3010                    if annotation_file.startswith("snpeff"):
3011
3012                        log.debug(f"Quick Annotation snpEff")
3013
3014                        if "snpeff" not in param["annotation"]:
3015                            param["annotation"]["snpeff"] = {}
3016
3017                        if "options" not in param["annotation"]["snpeff"]:
3018                            param["annotation"]["snpeff"]["options"] = ""
3019
3020                        # snpEff options in annotations
3021                        param["annotation"]["snpeff"]["options"] = "".join(
3022                            annotation_file.split(":")[1:]
3023                        )
3024
3025                    # Annotation Annovar
3026                    elif annotation_file.startswith("annovar"):
3027
3028                        log.debug(f"Quick Annotation Annovar")
3029
3030                        if "annovar" not in param["annotation"]:
3031                            param["annotation"]["annovar"] = {}
3032
3033                        if "annotations" not in param["annotation"]["annovar"]:
3034                            param["annotation"]["annovar"]["annotations"] = {}
3035
3036                        # Options
3037                        annotation_file_split = annotation_file.split(":")
3038                        for annotation_file_annotation in annotation_file_split[1:]:
3039                            if annotation_file_annotation:
3040                                param["annotation"]["annovar"]["annotations"][
3041                                    annotation_file_annotation
3042                                ] = annotations
3043
3044                    # Annotation Exomiser
3045                    elif annotation_file.startswith("exomiser"):
3046
3047                        log.debug(f"Quick Annotation Exomiser")
3048
3049                        param["annotation"]["exomiser"] = params_string_to_dict(
3050                            annotation_file
3051                        )
3052
3053                    # Annotation Splice
3054                    elif annotation_file.startswith("splice"):
3055
3056                        log.debug(f"Quick Annotation Splice")
3057
3058                        param["annotation"]["splice"] = params_string_to_dict(
3059                            annotation_file
3060                        )
3061
3062                    # Annotation Parquet or BCFTOOLS
3063                    else:
3064
3065                        # Tools detection
3066                        if annotation_file.startswith("bcftools:"):
3067                            annotation_tool_initial = "bcftools"
3068                            annotation_file = ":".join(annotation_file.split(":")[1:])
3069                        elif annotation_file.startswith("snpsift:"):
3070                            annotation_tool_initial = "snpsift"
3071                            annotation_file = ":".join(annotation_file.split(":")[1:])
3072                        else:
3073                            annotation_tool_initial = None
3074
3075                        # list of files
3076                        annotation_file_list = annotation_file.replace("+", ":").split(
3077                            ":"
3078                        )
3079
3080                        for annotation_file in annotation_file_list:
3081
3082                            if annotation_file:
3083
3084                                # Annotation tool initial
3085                                annotation_tool = annotation_tool_initial
3086
3087                                # Find file
3088                                annotation_file_found = None
3089
3090                                # Expand user
3091                                annotation_file = full_path(annotation_file)
3092
3093                                if os.path.exists(annotation_file):
3094                                    annotation_file_found = annotation_file
3095
3096                                else:
3097                                    # Find within assembly folders
3098                                    for annotations_database in annotations_databases:
3099                                        found_files = find_all(
3100                                            annotation_file,
3101                                            os.path.join(
3102                                                annotations_database, assembly
3103                                            ),
3104                                        )
3105                                        if len(found_files) > 0:
3106                                            annotation_file_found = found_files[0]
3107                                            break
3108                                    if not annotation_file_found and not assembly:
3109                                        # Find within folders
3110                                        for (
3111                                            annotations_database
3112                                        ) in annotations_databases:
3113                                            found_files = find_all(
3114                                                annotation_file, annotations_database
3115                                            )
3116                                            if len(found_files) > 0:
3117                                                annotation_file_found = found_files[0]
3118                                                break
3119                                log.debug(
3120                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3121                                )
3122
3123                                # Full path
3124                                annotation_file_found = full_path(annotation_file_found)
3125
3126                                if annotation_file_found:
3127
3128                                    database = Database(database=annotation_file_found)
3129                                    quick_annotation_format = database.get_format()
3130                                    quick_annotation_is_compressed = (
3131                                        database.is_compressed()
3132                                    )
3133                                    quick_annotation_is_indexed = os.path.exists(
3134                                        f"{annotation_file_found}.tbi"
3135                                    )
3136                                    bcftools_preference = False
3137
3138                                    # Check Annotation Tool
3139                                    if not annotation_tool:
3140                                        if (
3141                                            bcftools_preference
3142                                            and quick_annotation_format
3143                                            in ["vcf", "bed"]
3144                                            and quick_annotation_is_compressed
3145                                            and quick_annotation_is_indexed
3146                                        ):
3147                                            annotation_tool = "bcftools"
3148                                        elif quick_annotation_format in [
3149                                            "vcf",
3150                                            "bed",
3151                                            "tsv",
3152                                            "tsv",
3153                                            "csv",
3154                                            "json",
3155                                            "tbl",
3156                                            "parquet",
3157                                            "duckdb",
3158                                        ]:
3159                                            annotation_tool = "parquet"
3160                                        else:
3161                                            log.error(
3162                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3163                                            )
3164                                            raise ValueError(
3165                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3166                                            )
3167
3168                                    log.debug(
3169                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3170                                    )
3171
3172                                    # Annotation Tool dispatch
3173                                    if annotation_tool:
3174                                        if annotation_tool not in param["annotation"]:
3175                                            param["annotation"][annotation_tool] = {}
3176                                        if (
3177                                            "annotations"
3178                                            not in param["annotation"][annotation_tool]
3179                                        ):
3180                                            param["annotation"][annotation_tool][
3181                                                "annotations"
3182                                            ] = {}
3183                                        param["annotation"][annotation_tool][
3184                                            "annotations"
3185                                        ][annotation_file_found] = annotations
3186
3187                                else:
3188                                    log.error(
3189                                        f"Quick Annotation File {annotation_file} does NOT exist"
3190                                    )
3191
3192                self.set_param(param)
3193
3194        if param.get("annotation", None):
3195            log.info("Annotations")
3196            if param.get("annotation", {}).get("parquet", None):
3197                log.info("Annotations 'parquet'...")
3198                self.annotation_parquet()
3199            if param.get("annotation", {}).get("bcftools", None):
3200                log.info("Annotations 'bcftools'...")
3201                self.annotation_bcftools()
3202            if param.get("annotation", {}).get("snpsift", None):
3203                log.info("Annotations 'snpsift'...")
3204                self.annotation_snpsift()
3205            if param.get("annotation", {}).get("annovar", None):
3206                log.info("Annotations 'annovar'...")
3207                self.annotation_annovar()
3208            if param.get("annotation", {}).get("snpeff", None):
3209                log.info("Annotations 'snpeff'...")
3210                self.annotation_snpeff()
3211            if param.get("annotation", {}).get("exomiser", None) is not None:
3212                log.info("Annotations 'exomiser'...")
3213                self.annotation_exomiser()
3214            if param.get("annotation", {}).get("splice", None) is not None:
3215                log.info("Annotations 'splice' ...")
3216                self.annotation_splice()
3217
3218        # Explode INFOS fields into table fields
3219        if self.get_explode_infos():
3220            self.explode_infos(
3221                prefix=self.get_explode_infos_prefix(),
3222                fields=self.get_explode_infos_fields(),
3223                force=True,
3224            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3226    def annotation_snpsift(self, threads: int = None) -> None:
3227        """
3228        This function annotate with bcftools
3229
3230        :param threads: Number of threads to use
3231        :return: the value of the variable "return_value".
3232        """
3233
3234        # DEBUG
3235        log.debug("Start annotation with bcftools databases")
3236
3237        # Threads
3238        if not threads:
3239            threads = self.get_threads()
3240        log.debug("Threads: " + str(threads))
3241
3242        # Config
3243        config = self.get_config()
3244        log.debug("Config: " + str(config))
3245
3246        # Config - snpSift
3247        snpsift_bin_command = get_bin_command(
3248            bin="SnpSift.jar",
3249            tool="snpsift",
3250            bin_type="jar",
3251            config=config,
3252            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3253        )
3254        if not snpsift_bin_command:
3255            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3256            log.error(msg_err)
3257            raise ValueError(msg_err)
3258
3259        # Config - bcftools
3260        bcftools_bin_command = get_bin_command(
3261            bin="bcftools",
3262            tool="bcftools",
3263            bin_type="bin",
3264            config=config,
3265            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3266        )
3267        if not bcftools_bin_command:
3268            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3269            log.error(msg_err)
3270            raise ValueError(msg_err)
3271
3272        # Config - BCFTools databases folders
3273        databases_folders = set(
3274            self.get_config()
3275            .get("folders", {})
3276            .get("databases", {})
3277            .get("annotations", ["."])
3278            + self.get_config()
3279            .get("folders", {})
3280            .get("databases", {})
3281            .get("bcftools", ["."])
3282        )
3283        log.debug("Databases annotations: " + str(databases_folders))
3284
3285        # Param
3286        annotations = (
3287            self.get_param()
3288            .get("annotation", {})
3289            .get("snpsift", {})
3290            .get("annotations", None)
3291        )
3292        log.debug("Annotations: " + str(annotations))
3293
3294        # Assembly
3295        assembly = self.get_param().get(
3296            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3297        )
3298
3299        # Data
3300        table_variants = self.get_table_variants()
3301
3302        # Check if not empty
3303        log.debug("Check if not empty")
3304        sql_query_chromosomes = (
3305            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3306        )
3307        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3308        if not sql_query_chromosomes_df["count"][0]:
3309            log.info(f"VCF empty")
3310            return
3311
3312        # VCF header
3313        vcf_reader = self.get_header()
3314        log.debug("Initial header: " + str(vcf_reader.infos))
3315
3316        # Existing annotations
3317        for vcf_annotation in self.get_header().infos:
3318
3319            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3320            log.debug(
3321                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3322            )
3323
3324        if annotations:
3325
3326            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3327
3328                # Export VCF file
3329                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3330
3331                # Init
3332                commands = {}
3333
3334                for annotation in annotations:
3335                    annotation_fields = annotations[annotation]
3336
3337                    # Annotation Name
3338                    annotation_name = os.path.basename(annotation)
3339
3340                    if not annotation_fields:
3341                        annotation_fields = {"INFO": None}
3342
3343                    log.debug(f"Annotation '{annotation_name}'")
3344                    log.debug(
3345                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3346                    )
3347
3348                    # Create Database
3349                    database = Database(
3350                        database=annotation,
3351                        databases_folders=databases_folders,
3352                        assembly=assembly,
3353                    )
3354
3355                    # Find files
3356                    db_file = database.get_database()
3357                    db_file = full_path(db_file)
3358                    db_hdr_file = database.get_header_file()
3359                    db_hdr_file = full_path(db_hdr_file)
3360                    db_file_type = database.get_format()
3361                    db_tbi_file = f"{db_file}.tbi"
3362                    db_file_compressed = database.is_compressed()
3363
3364                    # Check if compressed
3365                    if not db_file_compressed:
3366                        log.error(
3367                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3368                        )
3369                        raise ValueError(
3370                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3371                        )
3372
3373                    # Check if indexed
3374                    if not os.path.exists(db_tbi_file):
3375                        log.error(
3376                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3377                        )
3378                        raise ValueError(
3379                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3380                        )
3381
3382                    # Check index - try to create if not exists
3383                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3384                        log.error("Annotation failed: database not valid")
3385                        log.error(f"Annotation annotation file: {db_file}")
3386                        log.error(f"Annotation annotation header: {db_hdr_file}")
3387                        log.error(f"Annotation annotation index: {db_tbi_file}")
3388                        raise ValueError(
3389                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3390                        )
3391                    else:
3392
3393                        log.debug(
3394                            f"Annotation '{annotation}' - file: "
3395                            + str(db_file)
3396                            + " and "
3397                            + str(db_hdr_file)
3398                        )
3399
3400                        # Load header as VCF object
3401                        db_hdr_vcf = Variants(input=db_hdr_file)
3402                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3403                        log.debug(
3404                            "Annotation database header: "
3405                            + str(db_hdr_vcf_header_infos)
3406                        )
3407
3408                        # For all fields in database
3409                        annotation_fields_full = False
3410                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3411                            annotation_fields = {
3412                                key: key for key in db_hdr_vcf_header_infos
3413                            }
3414                            log.debug(
3415                                "Annotation database header - All annotations added: "
3416                                + str(annotation_fields)
3417                            )
3418                            annotation_fields_full = True
3419
3420                        # # Create file for field rename
3421                        # log.debug("Create file for field rename")
3422                        # tmp_rename = NamedTemporaryFile(
3423                        #     prefix=self.get_prefix(),
3424                        #     dir=self.get_tmp_dir(),
3425                        #     suffix=".rename",
3426                        #     delete=False,
3427                        # )
3428                        # tmp_rename_name = tmp_rename.name
3429                        # tmp_files.append(tmp_rename_name)
3430
3431                        # Number of fields
3432                        nb_annotation_field = 0
3433                        annotation_list = []
3434                        annotation_infos_rename_list = []
3435
3436                        for annotation_field in annotation_fields:
3437
3438                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3439                            annotation_fields_new_name = annotation_fields.get(
3440                                annotation_field, annotation_field
3441                            )
3442                            if not annotation_fields_new_name:
3443                                annotation_fields_new_name = annotation_field
3444
3445                            # Check if field is in DB and if field is not elready in input data
3446                            if (
3447                                annotation_field in db_hdr_vcf.get_header().infos
3448                                and annotation_fields_new_name
3449                                not in self.get_header().infos
3450                            ):
3451
3452                                log.info(
3453                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3454                                )
3455
3456                                # BCFTools annotate param to rename fields
3457                                if annotation_field != annotation_fields_new_name:
3458                                    annotation_infos_rename_list.append(
3459                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3460                                    )
3461
3462                                # Add INFO field to header
3463                                db_hdr_vcf_header_infos_number = (
3464                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3465                                )
3466                                db_hdr_vcf_header_infos_type = (
3467                                    db_hdr_vcf_header_infos[annotation_field].type
3468                                    or "String"
3469                                )
3470                                db_hdr_vcf_header_infos_description = (
3471                                    db_hdr_vcf_header_infos[annotation_field].desc
3472                                    or f"{annotation_field} description"
3473                                )
3474                                db_hdr_vcf_header_infos_source = (
3475                                    db_hdr_vcf_header_infos[annotation_field].source
3476                                    or "unknown"
3477                                )
3478                                db_hdr_vcf_header_infos_version = (
3479                                    db_hdr_vcf_header_infos[annotation_field].version
3480                                    or "unknown"
3481                                )
3482
3483                                vcf_reader.infos[annotation_fields_new_name] = (
3484                                    vcf.parser._Info(
3485                                        annotation_fields_new_name,
3486                                        db_hdr_vcf_header_infos_number,
3487                                        db_hdr_vcf_header_infos_type,
3488                                        db_hdr_vcf_header_infos_description,
3489                                        db_hdr_vcf_header_infos_source,
3490                                        db_hdr_vcf_header_infos_version,
3491                                        self.code_type_map[
3492                                            db_hdr_vcf_header_infos_type
3493                                        ],
3494                                    )
3495                                )
3496
3497                                annotation_list.append(annotation_field)
3498
3499                                nb_annotation_field += 1
3500
3501                            else:
3502
3503                                if (
3504                                    annotation_field
3505                                    not in db_hdr_vcf.get_header().infos
3506                                ):
3507                                    log.warning(
3508                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3509                                    )
3510                                if (
3511                                    annotation_fields_new_name
3512                                    in self.get_header().infos
3513                                ):
3514                                    log.warning(
3515                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3516                                    )
3517
3518                        log.info(
3519                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3520                        )
3521
3522                        annotation_infos = ",".join(annotation_list)
3523
3524                        if annotation_infos != "":
3525
3526                            # Annotated VCF (and error file)
3527                            tmp_annotation_vcf_name = os.path.join(
3528                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3529                            )
3530                            tmp_annotation_vcf_name_err = (
3531                                tmp_annotation_vcf_name + ".err"
3532                            )
3533
3534                            # Add fields to annotate
3535                            if not annotation_fields_full:
3536                                annotation_infos_option = f"-info {annotation_infos}"
3537                            else:
3538                                annotation_infos_option = ""
3539
3540                            # Info fields rename
3541                            if annotation_infos_rename_list:
3542                                annotation_infos_rename = " -c " + ",".join(
3543                                    annotation_infos_rename_list
3544                                )
3545                            else:
3546                                annotation_infos_rename = ""
3547
3548                            # Annotate command
3549                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3550
3551                            # Add command
3552                            commands[command_annotate] = tmp_annotation_vcf_name
3553
3554                if commands:
3555
3556                    # Export VCF file
3557                    self.export_variant_vcf(
3558                        vcf_file=tmp_vcf_name,
3559                        remove_info=True,
3560                        add_samples=False,
3561                        index=True,
3562                    )
3563                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3564
3565                    # Num command
3566                    nb_command = 0
3567
3568                    # Annotate
3569                    for command_annotate in commands:
3570                        nb_command += 1
3571                        log.info(
3572                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3573                        )
3574                        log.debug(f"command_annotate={command_annotate}")
3575                        run_parallel_commands([command_annotate], threads)
3576
3577                        # Debug
3578                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3579
3580                        # Update variants
3581                        log.info(
3582                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3583                        )
3584                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools.

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_bcftools(self, threads: int = None) -> None:
3586    def annotation_bcftools(self, threads: int = None) -> None:
3587        """
3588        This function annotate with bcftools
3589
3590        :param threads: Number of threads to use
3591        :return: the value of the variable "return_value".
3592        """
3593
3594        # DEBUG
3595        log.debug("Start annotation with bcftools databases")
3596
3597        # Threads
3598        if not threads:
3599            threads = self.get_threads()
3600        log.debug("Threads: " + str(threads))
3601
3602        # Config
3603        config = self.get_config()
3604        log.debug("Config: " + str(config))
3605
3606        # DEBUG
3607        delete_tmp = True
3608        if self.get_config().get("verbosity", "warning") in ["debug"]:
3609            delete_tmp = False
3610            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3611
3612        # Config - BCFTools bin command
3613        bcftools_bin_command = get_bin_command(
3614            bin="bcftools",
3615            tool="bcftools",
3616            bin_type="bin",
3617            config=config,
3618            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3619        )
3620        if not bcftools_bin_command:
3621            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3622            log.error(msg_err)
3623            raise ValueError(msg_err)
3624
3625        # Config - BCFTools databases folders
3626        databases_folders = set(
3627            self.get_config()
3628            .get("folders", {})
3629            .get("databases", {})
3630            .get("annotations", ["."])
3631            + self.get_config()
3632            .get("folders", {})
3633            .get("databases", {})
3634            .get("bcftools", ["."])
3635        )
3636        log.debug("Databases annotations: " + str(databases_folders))
3637
3638        # Param
3639        annotations = (
3640            self.get_param()
3641            .get("annotation", {})
3642            .get("bcftools", {})
3643            .get("annotations", None)
3644        )
3645        log.debug("Annotations: " + str(annotations))
3646
3647        # Assembly
3648        assembly = self.get_param().get(
3649            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3650        )
3651
3652        # Data
3653        table_variants = self.get_table_variants()
3654
3655        # Check if not empty
3656        log.debug("Check if not empty")
3657        sql_query_chromosomes = (
3658            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3659        )
3660        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3661        if not sql_query_chromosomes_df["count"][0]:
3662            log.info(f"VCF empty")
3663            return
3664
3665        # Export in VCF
3666        log.debug("Create initial file to annotate")
3667        tmp_vcf = NamedTemporaryFile(
3668            prefix=self.get_prefix(),
3669            dir=self.get_tmp_dir(),
3670            suffix=".vcf.gz",
3671            delete=False,
3672        )
3673        tmp_vcf_name = tmp_vcf.name
3674
3675        # VCF header
3676        vcf_reader = self.get_header()
3677        log.debug("Initial header: " + str(vcf_reader.infos))
3678
3679        # Existing annotations
3680        for vcf_annotation in self.get_header().infos:
3681
3682            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3683            log.debug(
3684                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3685            )
3686
3687        if annotations:
3688
3689            tmp_ann_vcf_list = []
3690            commands = []
3691            tmp_files = []
3692            err_files = []
3693
3694            for annotation in annotations:
3695                annotation_fields = annotations[annotation]
3696
3697                # Annotation Name
3698                annotation_name = os.path.basename(annotation)
3699
3700                if not annotation_fields:
3701                    annotation_fields = {"INFO": None}
3702
3703                log.debug(f"Annotation '{annotation_name}'")
3704                log.debug(
3705                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3706                )
3707
3708                # Create Database
3709                database = Database(
3710                    database=annotation,
3711                    databases_folders=databases_folders,
3712                    assembly=assembly,
3713                )
3714
3715                # Find files
3716                db_file = database.get_database()
3717                db_file = full_path(db_file)
3718                db_hdr_file = database.get_header_file()
3719                db_hdr_file = full_path(db_hdr_file)
3720                db_file_type = database.get_format()
3721                db_tbi_file = f"{db_file}.tbi"
3722                db_file_compressed = database.is_compressed()
3723
3724                # Check if compressed
3725                if not db_file_compressed:
3726                    log.error(
3727                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3728                    )
3729                    raise ValueError(
3730                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3731                    )
3732
3733                # Check if indexed
3734                if not os.path.exists(db_tbi_file):
3735                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3736                    raise ValueError(
3737                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3738                    )
3739
3740                # Check index - try to create if not exists
3741                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3742                    log.error("Annotation failed: database not valid")
3743                    log.error(f"Annotation annotation file: {db_file}")
3744                    log.error(f"Annotation annotation header: {db_hdr_file}")
3745                    log.error(f"Annotation annotation index: {db_tbi_file}")
3746                    raise ValueError(
3747                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3748                    )
3749                else:
3750
3751                    log.debug(
3752                        f"Annotation '{annotation}' - file: "
3753                        + str(db_file)
3754                        + " and "
3755                        + str(db_hdr_file)
3756                    )
3757
3758                    # Load header as VCF object
3759                    db_hdr_vcf = Variants(input=db_hdr_file)
3760                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3761                    log.debug(
3762                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3763                    )
3764
3765                    # For all fields in database
3766                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3767                        annotation_fields = {
3768                            key: key for key in db_hdr_vcf_header_infos
3769                        }
3770                        log.debug(
3771                            "Annotation database header - All annotations added: "
3772                            + str(annotation_fields)
3773                        )
3774
3775                    # Number of fields
3776                    nb_annotation_field = 0
3777                    annotation_list = []
3778
3779                    for annotation_field in annotation_fields:
3780
3781                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3782                        annotation_fields_new_name = annotation_fields.get(
3783                            annotation_field, annotation_field
3784                        )
3785                        if not annotation_fields_new_name:
3786                            annotation_fields_new_name = annotation_field
3787
3788                        # Check if field is in DB and if field is not elready in input data
3789                        if (
3790                            annotation_field in db_hdr_vcf.get_header().infos
3791                            and annotation_fields_new_name
3792                            not in self.get_header().infos
3793                        ):
3794
3795                            log.info(
3796                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3797                            )
3798
3799                            # Add INFO field to header
3800                            db_hdr_vcf_header_infos_number = (
3801                                db_hdr_vcf_header_infos[annotation_field].num or "."
3802                            )
3803                            db_hdr_vcf_header_infos_type = (
3804                                db_hdr_vcf_header_infos[annotation_field].type
3805                                or "String"
3806                            )
3807                            db_hdr_vcf_header_infos_description = (
3808                                db_hdr_vcf_header_infos[annotation_field].desc
3809                                or f"{annotation_field} description"
3810                            )
3811                            db_hdr_vcf_header_infos_source = (
3812                                db_hdr_vcf_header_infos[annotation_field].source
3813                                or "unknown"
3814                            )
3815                            db_hdr_vcf_header_infos_version = (
3816                                db_hdr_vcf_header_infos[annotation_field].version
3817                                or "unknown"
3818                            )
3819
3820                            vcf_reader.infos[annotation_fields_new_name] = (
3821                                vcf.parser._Info(
3822                                    annotation_fields_new_name,
3823                                    db_hdr_vcf_header_infos_number,
3824                                    db_hdr_vcf_header_infos_type,
3825                                    db_hdr_vcf_header_infos_description,
3826                                    db_hdr_vcf_header_infos_source,
3827                                    db_hdr_vcf_header_infos_version,
3828                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3829                                )
3830                            )
3831
3832                            # annotation_list.append(annotation_field)
3833                            if annotation_field != annotation_fields_new_name:
3834                                annotation_list.append(
3835                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3836                                )
3837                            else:
3838                                annotation_list.append(annotation_field)
3839
3840                            nb_annotation_field += 1
3841
3842                        else:
3843
3844                            if annotation_field not in db_hdr_vcf.get_header().infos:
3845                                log.warning(
3846                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3847                                )
3848                            if annotation_fields_new_name in self.get_header().infos:
3849                                log.warning(
3850                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3851                                )
3852
3853                    log.info(
3854                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3855                    )
3856
3857                    annotation_infos = ",".join(annotation_list)
3858
3859                    if annotation_infos != "":
3860
3861                        # Protect header for bcftools (remove "#CHROM" and variants line)
3862                        log.debug("Protect Header file - remove #CHROM line if exists")
3863                        tmp_header_vcf = NamedTemporaryFile(
3864                            prefix=self.get_prefix(),
3865                            dir=self.get_tmp_dir(),
3866                            suffix=".hdr",
3867                            delete=False,
3868                        )
3869                        tmp_header_vcf_name = tmp_header_vcf.name
3870                        tmp_files.append(tmp_header_vcf_name)
3871                        # Command
3872                        if db_hdr_file.endswith(".gz"):
3873                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3874                        else:
3875                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3876                        # Run
3877                        run_parallel_commands([command_extract_header], 1)
3878
3879                        # Find chomosomes
3880                        log.debug("Find chromosomes ")
3881                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3882                        sql_query_chromosomes_df = self.get_query_to_df(
3883                            sql_query_chromosomes
3884                        )
3885                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3886
3887                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3888
3889                        # BED columns in the annotation file
3890                        if db_file_type in ["bed"]:
3891                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3892
3893                        for chrom in chomosomes_list:
3894
3895                            # Create BED on initial VCF
3896                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3897                            tmp_bed = NamedTemporaryFile(
3898                                prefix=self.get_prefix(),
3899                                dir=self.get_tmp_dir(),
3900                                suffix=".bed",
3901                                delete=False,
3902                            )
3903                            tmp_bed_name = tmp_bed.name
3904                            tmp_files.append(tmp_bed_name)
3905
3906                            # Detecte regions
3907                            log.debug(
3908                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3909                            )
3910                            window = 1000000
3911                            sql_query_intervals_for_bed = f"""
3912                                SELECT  \"#CHROM\",
3913                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3914                                        \"POS\"+{window}
3915                                FROM {table_variants} as table_variants
3916                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3917                            """
3918                            regions = self.conn.execute(
3919                                sql_query_intervals_for_bed
3920                            ).fetchall()
3921                            merged_regions = merge_regions(regions)
3922                            log.debug(
3923                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3924                            )
3925
3926                            header = ["#CHROM", "START", "END"]
3927                            with open(tmp_bed_name, "w") as f:
3928                                # Write the header with tab delimiter
3929                                f.write("\t".join(header) + "\n")
3930                                for d in merged_regions:
3931                                    # Write each data row with tab delimiter
3932                                    f.write("\t".join(map(str, d)) + "\n")
3933
3934                            # Tmp files
3935                            tmp_annotation_vcf = NamedTemporaryFile(
3936                                prefix=self.get_prefix(),
3937                                dir=self.get_tmp_dir(),
3938                                suffix=".vcf.gz",
3939                                delete=False,
3940                            )
3941                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3942                            tmp_files.append(tmp_annotation_vcf_name)
3943                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3944                            tmp_annotation_vcf_name_err = (
3945                                tmp_annotation_vcf_name + ".err"
3946                            )
3947                            err_files.append(tmp_annotation_vcf_name_err)
3948
3949                            # Annotate Command
3950                            log.debug(
3951                                f"Annotation '{annotation}' - add bcftools command"
3952                            )
3953
3954                            # Command
3955                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3956
3957                            # Add command
3958                            commands.append(command_annotate)
3959
3960            # if some commands
3961            if commands:
3962
3963                # Export VCF file
3964                self.export_variant_vcf(
3965                    vcf_file=tmp_vcf_name,
3966                    remove_info=True,
3967                    add_samples=False,
3968                    index=True,
3969                )
3970
3971                # Threads
3972                # calculate threads for annotated commands
3973                if commands:
3974                    threads_bcftools_annotate = round(threads / len(commands))
3975                else:
3976                    threads_bcftools_annotate = 1
3977
3978                if not threads_bcftools_annotate:
3979                    threads_bcftools_annotate = 1
3980
3981                # Add threads option to bcftools commands
3982                if threads_bcftools_annotate > 1:
3983                    commands_threaded = []
3984                    for command in commands:
3985                        commands_threaded.append(
3986                            command.replace(
3987                                f"{bcftools_bin_command} annotate ",
3988                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3989                            )
3990                        )
3991                    commands = commands_threaded
3992
3993                # Command annotation multithreading
3994                log.debug(f"Annotation - Annotation commands: " + str(commands))
3995                log.info(
3996                    f"Annotation - Annotation multithreaded in "
3997                    + str(len(commands))
3998                    + " commands"
3999                )
4000
4001                run_parallel_commands(commands, threads)
4002
4003                # Merge
4004                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
4005
4006                if tmp_ann_vcf_list_cmd:
4007
4008                    # Tmp file
4009                    tmp_annotate_vcf = NamedTemporaryFile(
4010                        prefix=self.get_prefix(),
4011                        dir=self.get_tmp_dir(),
4012                        suffix=".vcf.gz",
4013                        delete=True,
4014                    )
4015                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
4016                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4017                    err_files.append(tmp_annotate_vcf_name_err)
4018
4019                    # Tmp file remove command
4020                    tmp_files_remove_command = ""
4021                    if tmp_files:
4022                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
4023
4024                    # Command merge
4025                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
4026                    log.info(
4027                        f"Annotation - Annotation merging "
4028                        + str(len(commands))
4029                        + " annotated files"
4030                    )
4031                    log.debug(f"Annotation - merge command: {merge_command}")
4032                    run_parallel_commands([merge_command], 1)
4033
4034                    # Error messages
4035                    log.info(f"Error/Warning messages:")
4036                    error_message_command_all = []
4037                    error_message_command_warning = []
4038                    error_message_command_err = []
4039                    for err_file in err_files:
4040                        with open(err_file, "r") as f:
4041                            for line in f:
4042                                message = line.strip()
4043                                error_message_command_all.append(message)
4044                                if line.startswith("[W::"):
4045                                    error_message_command_warning.append(message)
4046                                if line.startswith("[E::"):
4047                                    error_message_command_err.append(
4048                                        f"{err_file}: " + message
4049                                    )
4050                    # log info
4051                    for message in list(
4052                        set(error_message_command_err + error_message_command_warning)
4053                    ):
4054                        log.info(f"   {message}")
4055                    # debug info
4056                    for message in list(set(error_message_command_all)):
4057                        log.debug(f"   {message}")
4058                    # failed
4059                    if len(error_message_command_err):
4060                        log.error("Annotation failed: Error in commands")
4061                        raise ValueError("Annotation failed: Error in commands")
4062
4063                    # Update variants
4064                    log.info(f"Annotation - Updating...")
4065                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with Exomiser.

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_exomiser(self, threads: int = None) -> None:
4067    def annotation_exomiser(self, threads: int = None) -> None:
4068        """
4069        This function annotate with Exomiser
4070
4071        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4072        - "analysis" (dict/file):
4073            Full analysis dictionnary parameters (see Exomiser docs).
4074            Either a dict, or a file in JSON or YAML format.
4075            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4076            Default : None
4077        - "preset" (string):
4078            Analysis preset (available in config folder).
4079            Used if no full "analysis" is provided.
4080            Default: "exome"
4081        - "phenopacket" (dict/file):
4082            Samples and phenotipic features parameters (see Exomiser docs).
4083            Either a dict, or a file in JSON or YAML format.
4084            Default: None
4085        - "subject" (dict):
4086            Sample parameters (see Exomiser docs).
4087            Example:
4088                "subject":
4089                    {
4090                        "id": "ISDBM322017",
4091                        "sex": "FEMALE"
4092                    }
4093            Default: None
4094        - "sample" (string):
4095            Sample name to construct "subject" section:
4096                "subject":
4097                    {
4098                        "id": "<sample>",
4099                        "sex": "UNKNOWN_SEX"
4100                    }
4101            Default: None
4102        - "phenotypicFeatures" (dict)
4103            Phenotypic features to construct "subject" section.
4104            Example:
4105                "phenotypicFeatures":
4106                    [
4107                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4108                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4109                    ]
4110        - "hpo" (list)
4111            List of HPO ids as phenotypic features.
4112            Example:
4113                "hpo": ['0001156', '0001363', '0011304', '0010055']
4114            Default: []
4115        - "outputOptions" (dict):
4116            Output options (see Exomiser docs).
4117            Default:
4118                "output_options" =
4119                    {
4120                        "outputContributingVariantsOnly": False,
4121                        "numGenes": 0,
4122                        "outputFormats": ["TSV_VARIANT", "VCF"]
4123                    }
4124        - "transcript_source" (string):
4125            Transcript source (either "refseq", "ucsc", "ensembl")
4126            Default: "refseq"
4127        - "exomiser_to_info" (boolean):
4128            Add exomiser TSV file columns as INFO fields in VCF.
4129            Default: False
4130        - "release" (string):
4131            Exomise database release.
4132            If not exists, database release will be downloaded (take a while).
4133            Default: None (provided by application.properties configuration file)
4134        - "exomiser_application_properties" (file):
4135            Exomiser configuration file (see Exomiser docs).
4136            Useful to automatically download databases (especially for specific genome databases).
4137
4138        Notes:
4139        - If no sample in parameters, first sample in VCF will be chosen
4140        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4141
4142        :param threads: The number of threads to use
4143        :return: None.
4144        """
4145
4146        # DEBUG
4147        log.debug("Start annotation with Exomiser databases")
4148
4149        # Threads
4150        if not threads:
4151            threads = self.get_threads()
4152        log.debug("Threads: " + str(threads))
4153
4154        # Config
4155        config = self.get_config()
4156        log.debug("Config: " + str(config))
4157
4158        # Config - Folders - Databases
4159        databases_folders = (
4160            config.get("folders", {})
4161            .get("databases", {})
4162            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4163        )
4164        databases_folders = full_path(databases_folders)
4165        if not os.path.exists(databases_folders):
4166            log.error(f"Databases annotations: {databases_folders} NOT found")
4167        log.debug("Databases annotations: " + str(databases_folders))
4168
4169        # Config - Exomiser
4170        exomiser_bin_command = get_bin_command(
4171            bin="exomiser-cli*.jar",
4172            tool="exomiser",
4173            bin_type="jar",
4174            config=config,
4175            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4176        )
4177        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4178        if not exomiser_bin_command:
4179            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4180            log.error(msg_err)
4181            raise ValueError(msg_err)
4182
4183        # Param
4184        param = self.get_param()
4185        log.debug("Param: " + str(param))
4186
4187        # Param - Exomiser
4188        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4189        log.debug(f"Param Exomiser: {param_exomiser}")
4190
4191        # Param - Assembly
4192        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4193        log.debug("Assembly: " + str(assembly))
4194
4195        # Data
4196        table_variants = self.get_table_variants()
4197
4198        # Check if not empty
4199        log.debug("Check if not empty")
4200        sql_query_chromosomes = (
4201            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4202        )
4203        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4204            log.info(f"VCF empty")
4205            return False
4206
4207        # VCF header
4208        vcf_reader = self.get_header()
4209        log.debug("Initial header: " + str(vcf_reader.infos))
4210
4211        # Samples
4212        samples = self.get_header_sample_list()
4213        if not samples:
4214            log.error("No Samples in VCF")
4215            return False
4216        log.debug(f"Samples: {samples}")
4217
4218        # Memory limit
4219        memory_limit = self.get_memory("8G")
4220        log.debug(f"memory_limit: {memory_limit}")
4221
4222        # Exomiser java options
4223        exomiser_java_options = (
4224            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4225        )
4226        log.debug(f"Exomiser java options: {exomiser_java_options}")
4227
4228        # Download Exomiser (if not exists)
4229        exomiser_release = param_exomiser.get("release", None)
4230        exomiser_application_properties = param_exomiser.get(
4231            "exomiser_application_properties", None
4232        )
4233        databases_download_exomiser(
4234            assemblies=[assembly],
4235            exomiser_folder=databases_folders,
4236            exomiser_release=exomiser_release,
4237            exomiser_phenotype_release=exomiser_release,
4238            exomiser_application_properties=exomiser_application_properties,
4239        )
4240
4241        # Force annotation
4242        force_update_annotation = True
4243
4244        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4245            log.debug("Start annotation Exomiser")
4246
4247            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4248
4249                # tmp_dir = "/tmp/exomiser"
4250
4251                ### ANALYSIS ###
4252                ################
4253
4254                # Create analysis.json through analysis dict
4255                # either analysis in param or by default
4256                # depending on preset exome/genome)
4257
4258                # Init analysis dict
4259                param_exomiser_analysis_dict = {}
4260
4261                # analysis from param
4262                param_exomiser_analysis = param_exomiser.get("analysis", {})
4263                param_exomiser_analysis = full_path(param_exomiser_analysis)
4264
4265                # If analysis in param -> load anlaysis json
4266                if param_exomiser_analysis:
4267
4268                    # If param analysis is a file and exists
4269                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4270                        param_exomiser_analysis
4271                    ):
4272                        # Load analysis file into analysis dict (either yaml or json)
4273                        with open(param_exomiser_analysis) as json_file:
4274                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4275
4276                    # If param analysis is a dict
4277                    elif isinstance(param_exomiser_analysis, dict):
4278                        # Load analysis dict into analysis dict (either yaml or json)
4279                        param_exomiser_analysis_dict = param_exomiser_analysis
4280
4281                    # Error analysis type
4282                    else:
4283                        log.error(f"Analysis type unknown. Check param file.")
4284                        raise ValueError(f"Analysis type unknown. Check param file.")
4285
4286                # Case no input analysis config file/dict
4287                # Use preset (exome/genome) to open default config file
4288                if not param_exomiser_analysis_dict:
4289
4290                    # default preset
4291                    default_preset = "exome"
4292
4293                    # Get param preset or default preset
4294                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4295
4296                    # Try to find if preset is a file
4297                    if os.path.exists(param_exomiser_preset):
4298                        # Preset file is provided in full path
4299                        param_exomiser_analysis_default_config_file = (
4300                            param_exomiser_preset
4301                        )
4302                    # elif os.path.exists(full_path(param_exomiser_preset)):
4303                    #     # Preset file is provided in full path
4304                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4305                    elif os.path.exists(
4306                        os.path.join(folder_config, param_exomiser_preset)
4307                    ):
4308                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4309                        param_exomiser_analysis_default_config_file = os.path.join(
4310                            folder_config, param_exomiser_preset
4311                        )
4312                    else:
4313                        # Construct preset file
4314                        param_exomiser_analysis_default_config_file = os.path.join(
4315                            folder_config,
4316                            f"preset-{param_exomiser_preset}-analysis.json",
4317                        )
4318
4319                    # If preset file exists
4320                    param_exomiser_analysis_default_config_file = full_path(
4321                        param_exomiser_analysis_default_config_file
4322                    )
4323                    if os.path.exists(param_exomiser_analysis_default_config_file):
4324                        # Load prest file into analysis dict (either yaml or json)
4325                        with open(
4326                            param_exomiser_analysis_default_config_file
4327                        ) as json_file:
4328                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4329                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4330                                json_file
4331                            )
4332
4333                    # Error preset file
4334                    else:
4335                        log.error(
4336                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4337                        )
4338                        raise ValueError(
4339                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4340                        )
4341
4342                # If no analysis dict created
4343                if not param_exomiser_analysis_dict:
4344                    log.error(f"No analysis config")
4345                    raise ValueError(f"No analysis config")
4346
4347                # Log
4348                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4349
4350                ### PHENOPACKET ###
4351                ###################
4352
4353                # If no PhenoPacket in analysis dict -> check in param
4354                if "phenopacket" not in param_exomiser_analysis_dict:
4355
4356                    # If PhenoPacket in param -> load anlaysis json
4357                    if param_exomiser.get("phenopacket", None):
4358
4359                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4360                        param_exomiser_phenopacket = full_path(
4361                            param_exomiser_phenopacket
4362                        )
4363
4364                        # If param phenopacket is a file and exists
4365                        if isinstance(
4366                            param_exomiser_phenopacket, str
4367                        ) and os.path.exists(param_exomiser_phenopacket):
4368                            # Load phenopacket file into analysis dict (either yaml or json)
4369                            with open(param_exomiser_phenopacket) as json_file:
4370                                param_exomiser_analysis_dict["phenopacket"] = (
4371                                    yaml.safe_load(json_file)
4372                                )
4373
4374                        # If param phenopacket is a dict
4375                        elif isinstance(param_exomiser_phenopacket, dict):
4376                            # Load phenopacket dict into analysis dict (either yaml or json)
4377                            param_exomiser_analysis_dict["phenopacket"] = (
4378                                param_exomiser_phenopacket
4379                            )
4380
4381                        # Error phenopacket type
4382                        else:
4383                            log.error(f"Phenopacket type unknown. Check param file.")
4384                            raise ValueError(
4385                                f"Phenopacket type unknown. Check param file."
4386                            )
4387
4388                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4389                if "phenopacket" not in param_exomiser_analysis_dict:
4390
4391                    # Init PhenoPacket
4392                    param_exomiser_analysis_dict["phenopacket"] = {
4393                        "id": "analysis",
4394                        "proband": {},
4395                    }
4396
4397                    ### Add subject ###
4398
4399                    # If subject exists
4400                    param_exomiser_subject = param_exomiser.get("subject", {})
4401
4402                    # If subject not exists -> found sample ID
4403                    if not param_exomiser_subject:
4404
4405                        # Found sample ID in param
4406                        sample = param_exomiser.get("sample", None)
4407
4408                        # Find sample ID (first sample)
4409                        if not sample:
4410                            sample_list = self.get_header_sample_list()
4411                            if len(sample_list) > 0:
4412                                sample = sample_list[0]
4413                            else:
4414                                log.error(f"No sample found")
4415                                raise ValueError(f"No sample found")
4416
4417                        # Create subject
4418                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4419
4420                    # Add to dict
4421                    param_exomiser_analysis_dict["phenopacket"][
4422                        "subject"
4423                    ] = param_exomiser_subject
4424
4425                    ### Add "phenotypicFeatures" ###
4426
4427                    # If phenotypicFeatures exists
4428                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4429                        "phenotypicFeatures", []
4430                    )
4431
4432                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4433                    if not param_exomiser_phenotypicfeatures:
4434
4435                        # Found HPO in param
4436                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4437
4438                        # Split HPO if list in string format separated by comma
4439                        if isinstance(param_exomiser_hpo, str):
4440                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4441
4442                        # Create HPO list
4443                        for hpo in param_exomiser_hpo:
4444                            hpo_clean = re.sub("[^0-9]", "", hpo)
4445                            param_exomiser_phenotypicfeatures.append(
4446                                {
4447                                    "type": {
4448                                        "id": f"HP:{hpo_clean}",
4449                                        "label": f"HP:{hpo_clean}",
4450                                    }
4451                                }
4452                            )
4453
4454                    # Add to dict
4455                    param_exomiser_analysis_dict["phenopacket"][
4456                        "phenotypicFeatures"
4457                    ] = param_exomiser_phenotypicfeatures
4458
4459                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4460                    if not param_exomiser_phenotypicfeatures:
4461                        for step in param_exomiser_analysis_dict.get(
4462                            "analysis", {}
4463                        ).get("steps", []):
4464                            if "hiPhivePrioritiser" in step:
4465                                param_exomiser_analysis_dict.get("analysis", {}).get(
4466                                    "steps", []
4467                                ).remove(step)
4468
4469                ### Add Input File ###
4470
4471                # Initial file name and htsFiles
4472                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4473                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4474                    {
4475                        "uri": tmp_vcf_name,
4476                        "htsFormat": "VCF",
4477                        "genomeAssembly": assembly,
4478                    }
4479                ]
4480
4481                ### Add metaData ###
4482
4483                # If metaData not in analysis dict
4484                if "metaData" not in param_exomiser_analysis_dict:
4485                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4486                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4487                        "createdBy": "howard",
4488                        "phenopacketSchemaVersion": 1,
4489                    }
4490
4491                ### OutputOptions ###
4492
4493                # Init output result folder
4494                output_results = os.path.join(tmp_dir, "results")
4495
4496                # If no outputOptions in analysis dict
4497                if "outputOptions" not in param_exomiser_analysis_dict:
4498
4499                    # default output formats
4500                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4501
4502                    # Get outputOptions in param
4503                    output_options = param_exomiser.get("outputOptions", None)
4504
4505                    # If no output_options in param -> check
4506                    if not output_options:
4507                        output_options = {
4508                            "outputContributingVariantsOnly": False,
4509                            "numGenes": 0,
4510                            "outputFormats": defaut_output_formats,
4511                        }
4512
4513                    # Replace outputDirectory in output options
4514                    output_options["outputDirectory"] = output_results
4515                    output_options["outputFileName"] = "howard"
4516
4517                    # Add outputOptions in analysis dict
4518                    param_exomiser_analysis_dict["outputOptions"] = output_options
4519
4520                else:
4521
4522                    # Replace output_results and output format (if exists in param)
4523                    param_exomiser_analysis_dict["outputOptions"][
4524                        "outputDirectory"
4525                    ] = output_results
4526                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4527                        list(
4528                            set(
4529                                param_exomiser_analysis_dict.get(
4530                                    "outputOptions", {}
4531                                ).get("outputFormats", [])
4532                                + ["TSV_VARIANT", "VCF"]
4533                            )
4534                        )
4535                    )
4536
4537                # log
4538                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4539
4540                ### ANALYSIS FILE ###
4541                #####################
4542
4543                ### Full JSON analysis config file ###
4544
4545                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4546                with open(exomiser_analysis, "w") as fp:
4547                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4548
4549                ### SPLIT analysis and sample config files
4550
4551                # Splitted analysis dict
4552                param_exomiser_analysis_dict_for_split = (
4553                    param_exomiser_analysis_dict.copy()
4554                )
4555
4556                # Phenopacket JSON file
4557                exomiser_analysis_phenopacket = os.path.join(
4558                    tmp_dir, "analysis_phenopacket.json"
4559                )
4560                with open(exomiser_analysis_phenopacket, "w") as fp:
4561                    json.dump(
4562                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4563                        fp,
4564                        indent=4,
4565                    )
4566
4567                # Analysis JSON file without Phenopacket parameters
4568                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4569                exomiser_analysis_analysis = os.path.join(
4570                    tmp_dir, "analysis_analysis.json"
4571                )
4572                with open(exomiser_analysis_analysis, "w") as fp:
4573                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4574
4575                ### INITAL VCF file ###
4576                #######################
4577
4578                ### Create list of samples to use and include inti initial VCF file ####
4579
4580                # Subject (main sample)
4581                # Get sample ID in analysis dict
4582                sample_subject = (
4583                    param_exomiser_analysis_dict.get("phenopacket", {})
4584                    .get("subject", {})
4585                    .get("id", None)
4586                )
4587                sample_proband = (
4588                    param_exomiser_analysis_dict.get("phenopacket", {})
4589                    .get("proband", {})
4590                    .get("subject", {})
4591                    .get("id", None)
4592                )
4593                sample = []
4594                if sample_subject:
4595                    sample.append(sample_subject)
4596                if sample_proband:
4597                    sample.append(sample_proband)
4598
4599                # Get sample ID within Pedigree
4600                pedigree_persons_list = (
4601                    param_exomiser_analysis_dict.get("phenopacket", {})
4602                    .get("pedigree", {})
4603                    .get("persons", {})
4604                )
4605
4606                # Create list with all sample ID in pedigree (if exists)
4607                pedigree_persons = []
4608                for person in pedigree_persons_list:
4609                    pedigree_persons.append(person.get("individualId"))
4610
4611                # Concat subject sample ID and samples ID in pedigreesamples
4612                samples = list(set(sample + pedigree_persons))
4613
4614                # Check if sample list is not empty
4615                if not samples:
4616                    log.error(f"No samples found")
4617                    raise ValueError(f"No samples found")
4618
4619                # Create VCF with sample (either sample in param or first one by default)
4620                # Export VCF file
4621                self.export_variant_vcf(
4622                    vcf_file=tmp_vcf_name,
4623                    remove_info=True,
4624                    add_samples=True,
4625                    list_samples=samples,
4626                    index=False,
4627                )
4628
4629                ### Execute Exomiser ###
4630                ########################
4631
4632                # Init command
4633                exomiser_command = ""
4634
4635                # Command exomiser options
4636                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4637
4638                # Release
4639                exomiser_release = param_exomiser.get("release", None)
4640                if exomiser_release:
4641                    # phenotype data version
4642                    exomiser_options += (
4643                        f" --exomiser.phenotype.data-version={exomiser_release} "
4644                    )
4645                    # data version
4646                    exomiser_options += (
4647                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4648                    )
4649                    # variant white list
4650                    variant_white_list_file = (
4651                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4652                    )
4653                    if os.path.exists(
4654                        os.path.join(
4655                            databases_folders, assembly, variant_white_list_file
4656                        )
4657                    ):
4658                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4659
4660                # transcript_source
4661                transcript_source = param_exomiser.get(
4662                    "transcript_source", None
4663                )  # ucsc, refseq, ensembl
4664                if transcript_source:
4665                    exomiser_options += (
4666                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4667                    )
4668
4669                # If analysis contain proband param
4670                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4671                    "proband", {}
4672                ):
4673                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4674
4675                # If no proband (usually uniq sample)
4676                else:
4677                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4678
4679                # Log
4680                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4681
4682                # Run command
4683                result = subprocess.call(
4684                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4685                )
4686                if result:
4687                    log.error("Exomiser command failed")
4688                    raise ValueError("Exomiser command failed")
4689
4690                ### RESULTS ###
4691                ###############
4692
4693                ### Annotate with TSV fields ###
4694
4695                # Init result tsv file
4696                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4697
4698                # Init result tsv file
4699                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4700
4701                # Parse TSV file and explode columns in INFO field
4702                if exomiser_to_info and os.path.exists(output_results_tsv):
4703
4704                    # Log
4705                    log.debug("Exomiser columns to VCF INFO field")
4706
4707                    # Retrieve columns and types
4708                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4709                    output_results_tsv_df = self.get_query_to_df(query)
4710                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4711
4712                    # Init concat fields for update
4713                    sql_query_update_concat_fields = []
4714
4715                    # Fields to avoid
4716                    fields_to_avoid = [
4717                        "CONTIG",
4718                        "START",
4719                        "END",
4720                        "REF",
4721                        "ALT",
4722                        "QUAL",
4723                        "FILTER",
4724                        "GENOTYPE",
4725                    ]
4726
4727                    # List all columns to add into header
4728                    for header_column in output_results_tsv_columns:
4729
4730                        # If header column is enable
4731                        if header_column not in fields_to_avoid:
4732
4733                            # Header info type
4734                            header_info_type = "String"
4735                            header_column_df = output_results_tsv_df[header_column]
4736                            header_column_df_dtype = header_column_df.dtype
4737                            if header_column_df_dtype == object:
4738                                if (
4739                                    pd.to_numeric(header_column_df, errors="coerce")
4740                                    .notnull()
4741                                    .all()
4742                                ):
4743                                    header_info_type = "Float"
4744                            else:
4745                                header_info_type = "Integer"
4746
4747                            # Header info
4748                            characters_to_validate = ["-"]
4749                            pattern = "[" + "".join(characters_to_validate) + "]"
4750                            header_info_name = re.sub(
4751                                pattern,
4752                                "_",
4753                                f"Exomiser_{header_column}".replace("#", ""),
4754                            )
4755                            header_info_number = "."
4756                            header_info_description = (
4757                                f"Exomiser {header_column} annotation"
4758                            )
4759                            header_info_source = "Exomiser"
4760                            header_info_version = "unknown"
4761                            header_info_code = CODE_TYPE_MAP[header_info_type]
4762                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4763                                header_info_name,
4764                                header_info_number,
4765                                header_info_type,
4766                                header_info_description,
4767                                header_info_source,
4768                                header_info_version,
4769                                header_info_code,
4770                            )
4771
4772                            # Add field to add for update to concat fields
4773                            sql_query_update_concat_fields.append(
4774                                f"""
4775                                CASE
4776                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4777                                    THEN concat(
4778                                        '{header_info_name}=',
4779                                        table_parquet."{header_column}",
4780                                        ';'
4781                                        )
4782
4783                                    ELSE ''
4784                                END
4785                            """
4786                            )
4787
4788                    # Update query
4789                    sql_query_update = f"""
4790                        UPDATE {table_variants} as table_variants
4791                            SET INFO = concat(
4792                                            CASE
4793                                                WHEN INFO NOT IN ('', '.')
4794                                                THEN INFO
4795                                                ELSE ''
4796                                            END,
4797                                            CASE
4798                                                WHEN table_variants.INFO NOT IN ('','.')
4799                                                THEN ';'
4800                                                ELSE ''
4801                                            END,
4802                                            (
4803                                            SELECT 
4804                                                concat(
4805                                                    {",".join(sql_query_update_concat_fields)}
4806                                                )
4807                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4808                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4809                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4810                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4811                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4812                                            )
4813                                        )
4814                            ;
4815                        """
4816
4817                    # Update
4818                    self.conn.execute(sql_query_update)
4819
4820                ### Annotate with VCF INFO field ###
4821
4822                # Init result VCF file
4823                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4824
4825                # If VCF exists
4826                if os.path.exists(output_results_vcf):
4827
4828                    # Log
4829                    log.debug("Exomiser result VCF update variants")
4830
4831                    # Find Exomiser INFO field annotation in header
4832                    with gzip.open(output_results_vcf, "rt") as f:
4833                        header_list = self.read_vcf_header(f)
4834                    exomiser_vcf_header = vcf.Reader(
4835                        io.StringIO("\n".join(header_list))
4836                    )
4837
4838                    # Add annotation INFO field to header
4839                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4840
4841                    # Update variants with VCF
4842                    self.update_from_vcf(output_results_vcf)
4843
4844        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default : None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (this takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4846    def annotation_snpeff(self, threads: int = None) -> None:
4847        """
4848        This function annotate with snpEff
4849
4850        :param threads: The number of threads to use
4851        :return: the value of the variable "return_value".
4852        """
4853
4854        # DEBUG
4855        log.debug("Start annotation with snpeff databases")
4856
4857        # Threads
4858        if not threads:
4859            threads = self.get_threads()
4860        log.debug("Threads: " + str(threads))
4861
4862        # DEBUG
4863        delete_tmp = True
4864        if self.get_config().get("verbosity", "warning") in ["debug"]:
4865            delete_tmp = False
4866            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4867
4868        # Config
4869        config = self.get_config()
4870        log.debug("Config: " + str(config))
4871
4872        # Config - Folders - Databases
4873        databases_folders = (
4874            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4875        )
4876        log.debug("Databases annotations: " + str(databases_folders))
4877
4878        # # Config - Java
4879        # java_bin = get_bin(
4880        #     tool="java",
4881        #     bin="java",
4882        #     bin_type="bin",
4883        #     config=config,
4884        #     default_folder="/usr/bin",
4885        # )
4886        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4887        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4888        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4889
4890        # # Config - snpEff bin
4891        # snpeff_jar = get_bin(
4892        #     tool="snpeff",
4893        #     bin="snpEff.jar",
4894        #     bin_type="jar",
4895        #     config=config,
4896        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4897        # )
4898        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4899        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4900        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4901
4902        # Config - snpEff bin command
4903        snpeff_bin_command = get_bin_command(
4904            bin="snpEff.jar",
4905            tool="snpeff",
4906            bin_type="jar",
4907            config=config,
4908            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4909        )
4910        if not snpeff_bin_command:
4911            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4912            log.error(msg_err)
4913            raise ValueError(msg_err)
4914
4915        # Config - snpEff databases
4916        snpeff_databases = (
4917            config.get("folders", {})
4918            .get("databases", {})
4919            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4920        )
4921        snpeff_databases = full_path(snpeff_databases)
4922        if snpeff_databases is not None and snpeff_databases != "":
4923            log.debug(f"Create snpEff databases folder")
4924            if not os.path.exists(snpeff_databases):
4925                os.makedirs(snpeff_databases)
4926
4927        # Param
4928        param = self.get_param()
4929        log.debug("Param: " + str(param))
4930
4931        # Param
4932        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4933        log.debug("Options: " + str(options))
4934
4935        # Param - Assembly
4936        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4937
4938        # Param - Options
4939        snpeff_options = (
4940            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4941        )
4942        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4943        snpeff_csvstats = (
4944            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4945        )
4946        if snpeff_stats:
4947            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4948            snpeff_stats = full_path(snpeff_stats)
4949            snpeff_options += f" -stats {snpeff_stats}"
4950        if snpeff_csvstats:
4951            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4952            snpeff_csvstats = full_path(snpeff_csvstats)
4953            snpeff_options += f" -csvStats {snpeff_csvstats}"
4954
4955        # Data
4956        table_variants = self.get_table_variants()
4957
4958        # Check if not empty
4959        log.debug("Check if not empty")
4960        sql_query_chromosomes = (
4961            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4962        )
4963        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4964        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4965            log.info(f"VCF empty")
4966            return
4967
4968        # Export in VCF
4969        log.debug("Create initial file to annotate")
4970        tmp_vcf = NamedTemporaryFile(
4971            prefix=self.get_prefix(),
4972            dir=self.get_tmp_dir(),
4973            suffix=".vcf.gz",
4974            delete=True,
4975        )
4976        tmp_vcf_name = tmp_vcf.name
4977
4978        # VCF header
4979        vcf_reader = self.get_header()
4980        log.debug("Initial header: " + str(vcf_reader.infos))
4981
4982        # Existing annotations
4983        for vcf_annotation in self.get_header().infos:
4984
4985            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4986            log.debug(
4987                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4988            )
4989
4990        # Memory limit
4991        # if config.get("memory", None):
4992        #     memory_limit = config.get("memory", "8G")
4993        # else:
4994        #     memory_limit = "8G"
4995        memory_limit = self.get_memory("8G")
4996        log.debug(f"memory_limit: {memory_limit}")
4997
4998        # snpEff java options
4999        snpeff_java_options = (
5000            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
5001        )
5002        log.debug(f"Exomiser java options: {snpeff_java_options}")
5003
5004        force_update_annotation = True
5005
5006        if "ANN" not in self.get_header().infos or force_update_annotation:
5007
5008            # Check snpEff database
5009            log.debug(f"Check snpEff databases {[assembly]}")
5010            databases_download_snpeff(
5011                folder=snpeff_databases, assemblies=[assembly], config=config
5012            )
5013
5014            # Export VCF file
5015            self.export_variant_vcf(
5016                vcf_file=tmp_vcf_name,
5017                remove_info=True,
5018                add_samples=False,
5019                index=True,
5020            )
5021
5022            # Tmp file
5023            err_files = []
5024            tmp_annotate_vcf = NamedTemporaryFile(
5025                prefix=self.get_prefix(),
5026                dir=self.get_tmp_dir(),
5027                suffix=".vcf",
5028                delete=False,
5029            )
5030            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5031            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5032            err_files.append(tmp_annotate_vcf_name_err)
5033
5034            # Command
5035            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5036            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5037            run_parallel_commands([snpeff_command], 1)
5038
5039            # Error messages
5040            log.info(f"Error/Warning messages:")
5041            error_message_command_all = []
5042            error_message_command_warning = []
5043            error_message_command_err = []
5044            for err_file in err_files:
5045                with open(err_file, "r") as f:
5046                    for line in f:
5047                        message = line.strip()
5048                        error_message_command_all.append(message)
5049                        if line.startswith("[W::"):
5050                            error_message_command_warning.append(message)
5051                        if line.startswith("[E::"):
5052                            error_message_command_err.append(f"{err_file}: " + message)
5053            # log info
5054            for message in list(
5055                set(error_message_command_err + error_message_command_warning)
5056            ):
5057                log.info(f"   {message}")
5058            # debug info
5059            for message in list(set(error_message_command_all)):
5060                log.debug(f"   {message}")
5061            # failed
5062            if len(error_message_command_err):
5063                log.error("Annotation failed: Error in commands")
5064                raise ValueError("Annotation failed: Error in commands")
5065
5066            # Find annotation in header
5067            with open(tmp_annotate_vcf_name, "rt") as f:
5068                header_list = self.read_vcf_header(f)
5069            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5070
5071            for ann in annovar_vcf_header.infos:
5072                if ann not in self.get_header().infos:
5073                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5074
5075            # Update variants
5076            log.info(f"Annotation - Updating...")
5077            self.update_from_vcf(tmp_annotate_vcf_name)
5078
5079        else:
5080            if "ANN" in self.get_header().infos:
5081                log.debug(f"Existing snpEff annotations in VCF")
5082            if force_update_annotation:
5083                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
5085    def annotation_annovar(self, threads: int = None) -> None:
5086        """
5087        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
5088        annotations
5089
5090        :param threads: number of threads to use
5091        :return: the value of the variable "return_value".
5092        """
5093
5094        # DEBUG
5095        log.debug("Start annotation with Annovar databases")
5096
5097        # Threads
5098        if not threads:
5099            threads = self.get_threads()
5100        log.debug("Threads: " + str(threads))
5101
5102        # Tmp en Err files
5103        tmp_files = []
5104        err_files = []
5105
5106        # DEBUG
5107        delete_tmp = True
5108        if self.get_config().get("verbosity", "warning") in ["debug"]:
5109            delete_tmp = False
5110            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5111
5112        # Config
5113        config = self.get_config()
5114        log.debug("Config: " + str(config))
5115
5116        # Config - Folders - Databases
5117        databases_folders = (
5118            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
5119        )
5120        log.debug("Databases annotations: " + str(databases_folders))
5121
5122        # Config - annovar bin command
5123        annovar_bin_command = get_bin_command(
5124            bin="table_annovar.pl",
5125            tool="annovar",
5126            bin_type="perl",
5127            config=config,
5128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
5129        )
5130        if not annovar_bin_command:
5131            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
5132            log.error(msg_err)
5133            raise ValueError(msg_err)
5134
5135        # Config - BCFTools bin command
5136        bcftools_bin_command = get_bin_command(
5137            bin="bcftools",
5138            tool="bcftools",
5139            bin_type="bin",
5140            config=config,
5141            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5142        )
5143        if not bcftools_bin_command:
5144            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5145            log.error(msg_err)
5146            raise ValueError(msg_err)
5147
5148        # Config - annovar databases
5149        annovar_databases = (
5150            config.get("folders", {})
5151            .get("databases", {})
5152            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5153        )
5154        annovar_databases = full_path(annovar_databases)
5155        if annovar_databases != "" and not os.path.exists(annovar_databases):
5156            os.makedirs(annovar_databases)
5157
5158        # Param
5159        param = self.get_param()
5160        log.debug("Param: " + str(param))
5161
5162        # Param - options
5163        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5164        log.debug("Options: " + str(options))
5165
5166        # Param - annotations
5167        annotations = (
5168            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5169        )
5170        log.debug("Annotations: " + str(annotations))
5171
5172        # Param - Assembly
5173        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5174
5175        # Annovar database assembly
5176        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5177        if annovar_databases_assembly != "" and not os.path.exists(
5178            annovar_databases_assembly
5179        ):
5180            os.makedirs(annovar_databases_assembly)
5181
5182        # Data
5183        table_variants = self.get_table_variants()
5184
5185        # Check if not empty
5186        log.debug("Check if not empty")
5187        sql_query_chromosomes = (
5188            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5189        )
5190        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5191        if not sql_query_chromosomes_df["count"][0]:
5192            log.info(f"VCF empty")
5193            return
5194
5195        # VCF header
5196        vcf_reader = self.get_header()
5197        log.debug("Initial header: " + str(vcf_reader.infos))
5198
5199        # Existing annotations
5200        for vcf_annotation in self.get_header().infos:
5201
5202            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5203            log.debug(
5204                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5205            )
5206
5207        force_update_annotation = True
5208
5209        if annotations:
5210
5211            commands = []
5212            tmp_annotates_vcf_name_list = []
5213
5214            # Export in VCF
5215            log.debug("Create initial file to annotate")
5216            tmp_vcf = NamedTemporaryFile(
5217                prefix=self.get_prefix(),
5218                dir=self.get_tmp_dir(),
5219                suffix=".vcf.gz",
5220                delete=False,
5221            )
5222            tmp_vcf_name = tmp_vcf.name
5223            tmp_files.append(tmp_vcf_name)
5224            tmp_files.append(tmp_vcf_name + ".tbi")
5225
5226            # Export VCF file
5227            self.export_variant_vcf(
5228                vcf_file=tmp_vcf_name,
5229                remove_info=".",
5230                add_samples=False,
5231                index=True,
5232            )
5233
5234            # Create file for field rename
5235            log.debug("Create file for field rename")
5236            tmp_rename = NamedTemporaryFile(
5237                prefix=self.get_prefix(),
5238                dir=self.get_tmp_dir(),
5239                suffix=".rename",
5240                delete=False,
5241            )
5242            tmp_rename_name = tmp_rename.name
5243            tmp_files.append(tmp_rename_name)
5244
5245            # Check Annovar database
5246            log.debug(
5247                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5248            )
5249            databases_download_annovar(
5250                folder=annovar_databases,
5251                files=list(annotations.keys()),
5252                assemblies=[assembly],
5253            )
5254
5255            for annotation in annotations:
5256                annotation_fields = annotations[annotation]
5257
5258                if not annotation_fields:
5259                    annotation_fields = {"INFO": None}
5260
5261                log.info(f"Annotations Annovar - database '{annotation}'")
5262                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5263
5264                # Tmp file for annovar
5265                err_files = []
5266                tmp_annotate_vcf_directory = TemporaryDirectory(
5267                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5268                )
5269                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5270                tmp_annotate_vcf_name_annovar = (
5271                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5272                )
5273                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5274                err_files.append(tmp_annotate_vcf_name_err)
5275                tmp_files.append(tmp_annotate_vcf_name_err)
5276
5277                # Tmp file final vcf annotated by annovar
5278                tmp_annotate_vcf = NamedTemporaryFile(
5279                    prefix=self.get_prefix(),
5280                    dir=self.get_tmp_dir(),
5281                    suffix=".vcf.gz",
5282                    delete=False,
5283                )
5284                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5285                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5286                tmp_files.append(tmp_annotate_vcf_name)
5287                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5288
5289                # Number of fields
5290                annotation_list = []
5291                annotation_renamed_list = []
5292
5293                for annotation_field in annotation_fields:
5294
5295                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5296                    annotation_fields_new_name = annotation_fields.get(
5297                        annotation_field, annotation_field
5298                    )
5299                    if not annotation_fields_new_name:
5300                        annotation_fields_new_name = annotation_field
5301
5302                    if (
5303                        force_update_annotation
5304                        or annotation_fields_new_name not in self.get_header().infos
5305                    ):
5306                        annotation_list.append(annotation_field)
5307                        annotation_renamed_list.append(annotation_fields_new_name)
5308                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5309                        log.warning(
5310                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5311                        )
5312
5313                    # Add rename info
5314                    run_parallel_commands(
5315                        [
5316                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5317                        ],
5318                        1,
5319                    )
5320
5321                # log.debug("fields_to_removed: " + str(fields_to_removed))
5322                log.debug("annotation_list: " + str(annotation_list))
5323
5324                # protocol
5325                protocol = annotation
5326
5327                # argument
5328                argument = ""
5329
5330                # operation
5331                operation = "f"
5332                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5333                    "ensGene"
5334                ):
5335                    operation = "g"
5336                    if options.get("genebase", None):
5337                        argument = f"""'{options.get("genebase","")}'"""
5338                elif annotation in ["cytoBand"]:
5339                    operation = "r"
5340
5341                # argument option
5342                argument_option = ""
5343                if argument != "":
5344                    argument_option = " --argument " + argument
5345
5346                # command options
5347                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5348                for option in options:
5349                    if option not in ["genebase"]:
5350                        command_options += f""" --{option}={options[option]}"""
5351
5352                # Command
5353
5354                # Command - Annovar
5355                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5356                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5357
5358                # Command - start pipe
5359                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5360
5361                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5362                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5363
5364                # Command - Special characters (refGene annotation)
5365                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5366
5367                # Command - Clean empty fields (with value ".")
5368                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5369
5370                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5371                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5372                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5373                    # for ann in annotation_renamed_list:
5374                    for ann in annotation_list:
5375                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5376
5377                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5378
5379                # Command - indexing
5380                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5381
5382                log.debug(f"Annotation - Annovar command: {command_annovar}")
5383                run_parallel_commands([command_annovar], 1)
5384
5385                # Error messages
5386                log.info(f"Error/Warning messages:")
5387                error_message_command_all = []
5388                error_message_command_warning = []
5389                error_message_command_err = []
5390                for err_file in err_files:
5391                    with open(err_file, "r") as f:
5392                        for line in f:
5393                            message = line.strip()
5394                            error_message_command_all.append(message)
5395                            if line.startswith("[W::") or line.startswith("WARNING"):
5396                                error_message_command_warning.append(message)
5397                            if line.startswith("[E::") or line.startswith("ERROR"):
5398                                error_message_command_err.append(
5399                                    f"{err_file}: " + message
5400                                )
5401                # log info
5402                for message in list(
5403                    set(error_message_command_err + error_message_command_warning)
5404                ):
5405                    log.info(f"   {message}")
5406                # debug info
5407                for message in list(set(error_message_command_all)):
5408                    log.debug(f"   {message}")
5409                # failed
5410                if len(error_message_command_err):
5411                    log.error("Annotation failed: Error in commands")
5412                    raise ValueError("Annotation failed: Error in commands")
5413
5414            if tmp_annotates_vcf_name_list:
5415
5416                # List of annotated files
5417                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5418
5419                # Tmp file
5420                tmp_annotate_vcf = NamedTemporaryFile(
5421                    prefix=self.get_prefix(),
5422                    dir=self.get_tmp_dir(),
5423                    suffix=".vcf.gz",
5424                    delete=False,
5425                )
5426                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5427                tmp_files.append(tmp_annotate_vcf_name)
5428                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5429                err_files.append(tmp_annotate_vcf_name_err)
5430                tmp_files.append(tmp_annotate_vcf_name_err)
5431
5432                # Command merge
5433                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5434                log.info(
5435                    f"Annotation Annovar - Annotation merging "
5436                    + str(len(tmp_annotates_vcf_name_list))
5437                    + " annotated files"
5438                )
5439                log.debug(f"Annotation - merge command: {merge_command}")
5440                run_parallel_commands([merge_command], 1)
5441
5442                # Find annotation in header
5443                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5444                    header_list = self.read_vcf_header(f)
5445                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5446
5447                for ann in annovar_vcf_header.infos:
5448                    if ann not in self.get_header().infos:
5449                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5450
5451                # Update variants
5452                log.info(f"Annotation Annovar - Updating...")
5453                self.update_from_vcf(tmp_annotate_vcf_name)
5454
5455            # Clean files
5456            # Tmp file remove command
5457            if True:
5458                tmp_files_remove_command = ""
5459                if tmp_files:
5460                    tmp_files_remove_command = " ".join(tmp_files)
5461                clean_command = f" rm -f {tmp_files_remove_command} "
5462                log.debug(f"Annotation Annovar - Annotation cleaning ")
5463                log.debug(f"Annotation - cleaning command: {clean_command}")
5464                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

the value of the variable "return_value".

def annotation_parquet(self, threads: int = None) -> None:
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table in place with parquet annotation databases.

        Databases are taken from param ``annotation.parquet.annotations``; the
        special key "ALL" expands to every database found by ``scan_databases``
        (optionally filtered by ``formats``/``releases``). For each database, the
        requested INFO fields (or all fields for "ALL"/"INFO") are concatenated
        into the ``INFO`` column of the variants table through one SQL UPDATE per
        chromosome, and the in-memory VCF header is extended accordingly.

        Param options ``annotation.options.annotations_update`` (remove existing
        values first, then re-annotate) and ``annotation.options.annotations_append``
        (only fill empty/missing values) alter the generated queries.

        :param threads: number of threads to use for the annotation; falls back to
            the configured thread count when None (or 0, which is also falsy)
        :return: None; the variants table and the header are modified in place.
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # Keep temporary files/folders when running in debug verbosity
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Search folders for databases: union of "annotations" and "parquet" folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        # NOTE: vcf_reader is the live header object; INFO fields added below are
        # visible to every later self.get_header() call.
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            if "ALL" in annotations:

                # Expand the special "ALL" key into one entry per scanned database
                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Extra (non-header) database columns get a synthetic String
                    # INFO definition so they can be annotated like header fields.
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                # (registered in query_dict_remove so it runs before
                                # the per-chromosome annotation queries)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            # (first annotated field gets no leading ';')
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # In append mode, only annotate rows whose existing value
                            # for this field is empty or '.'
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    # Shortcut: when every requested field is being annotated and the
                    # database provides a full INFO column, copy that column in one
                    # expression instead of per-field CASEs. Not usable in append
                    # mode (per-field checks needed) nor for region databases.
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Alias (not a copy): the 'remove field' queries registered
                        # above are kept at the front of the dict, so they execute
                        # before the per-chromosome annotation queries.
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # (join on overlap between variant POS/REF span and
                            # the region's START/END, aggregated per POS)
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                                )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # (exact match on CHROM/POS/REF/ALT)
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # (large concat() expressions can exceed DuckDB's default depth)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB exposes the number of rows changed by an UPDATE
                            # as a single "Count" column in the result
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        # NOTE(review): added_columns is never populated in this method, so this
        # loop is currently a no-op — confirm whether it is a leftover or a hook.
        for added_column in added_columns:
            self.drop_column(column=added_column)

Takes the loaded variants and annotates them with one or more parquet annotation databases, one SQL UPDATE query per chromosome.

Parameters
  • threads: number of threads to use for the annotation
Returns

None — the variants table and the in-memory VCF header are updated in place (the method returns early, without annotating, when the variants table is empty).
def annotation_splice(self, threads: int = None) -> None:
6043    def annotation_splice(self, threads: int = None) -> None:
6044        """
6045        This function annotate with snpEff
6046
6047        :param threads: The number of threads to use
6048        :return: the value of the variable "return_value".
6049        """
6050
6051        # DEBUG
6052        log.debug("Start annotation with splice tools")
6053
6054        # Threads
6055        if not threads:
6056            threads = self.get_threads()
6057        log.debug("Threads: " + str(threads))
6058
6059        # DEBUG
6060        delete_tmp = True
6061        if self.get_config().get("verbosity", "warning") in ["debug"]:
6062            delete_tmp = False
6063            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6064
6065        # Config
6066        config = self.get_config()
6067        log.debug("Config: " + str(config))
6068        splice_config = config.get("tools", {}).get("splice", {})
6069        if not splice_config:
6070            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6071        if not splice_config:
6072            msg_err = "No Splice tool config"
6073            log.error(msg_err)
6074            raise ValueError(msg_err)
6075        log.debug(f"splice_config={splice_config}")
6076
6077        # Config - Folders - Databases
6078        databases_folders = (
6079            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6080        )
6081        log.debug("Databases annotations: " + str(databases_folders))
6082
6083        # Splice docker image
6084        splice_docker_image = splice_config.get("docker").get("image")
6085
6086        # Pull splice image if it's not already there
6087        if not check_docker_image_exists(splice_docker_image):
6088            log.warning(
6089                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6090            )
6091            try:
6092                command(f"docker pull {splice_config.get('docker').get('image')}")
6093            except subprocess.CalledProcessError:
6094                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6095                log.error(msg_err)
6096                raise ValueError(msg_err)
6097                return None
6098
6099        # Config - splice databases
6100        splice_databases = (
6101            config.get("folders", {})
6102            .get("databases", {})
6103            .get("splice", DEFAULT_SPLICE_FOLDER)
6104        )
6105        splice_databases = full_path(splice_databases)
6106
6107        # Param
6108        param = self.get_param()
6109        log.debug("Param: " + str(param))
6110
6111        # Param
6112        options = param.get("annotation", {}).get("splice", {})
6113        log.debug("Options: " + str(options))
6114
6115        # Data
6116        table_variants = self.get_table_variants()
6117
6118        # Check if not empty
6119        log.debug("Check if not empty")
6120        sql_query_chromosomes = (
6121            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6122        )
6123        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6124            log.info("VCF empty")
6125            return None
6126
6127        # Export in VCF
6128        log.debug("Create initial file to annotate")
6129
6130        # Create output folder
6131        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6132        if not os.path.exists(output_folder):
6133            Path(output_folder).mkdir(parents=True, exist_ok=True)
6134
6135        # Create tmp VCF file
6136        tmp_vcf = NamedTemporaryFile(
6137            prefix=self.get_prefix(),
6138            dir=output_folder,
6139            suffix=".vcf",
6140            delete=False,
6141        )
6142        tmp_vcf_name = tmp_vcf.name
6143
6144        # VCF header
6145        header = self.get_header()
6146
6147        # Existing annotations
6148        for vcf_annotation in self.get_header().infos:
6149
6150            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6151            log.debug(
6152                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6153            )
6154
6155        # Memory limit
6156        if config.get("memory", None):
6157            memory_limit = config.get("memory", "8G").upper()
6158            # upper()
6159        else:
6160            memory_limit = "8G"
6161        log.debug(f"memory_limit: {memory_limit}")
6162
6163        # Check number of variants to annotate
6164        where_clause_regex_spliceai = r"SpliceAI_\w+"
6165        where_clause_regex_spip = r"SPiP_\w+"
6166        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6167        df_list_of_variants_to_annotate = self.get_query_to_df(
6168            query=f""" SELECT * FROM variants {where_clause} """
6169        )
6170        if len(df_list_of_variants_to_annotate) == 0:
6171            log.warning(
6172                f"No variants to annotate with splice. Variants probably already annotated with splice"
6173            )
6174            return None
6175        else:
6176            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6177
6178        # Export VCF file
6179        self.export_variant_vcf(
6180            vcf_file=tmp_vcf_name,
6181            remove_info=True,
6182            add_samples=True,
6183            index=False,
6184            where_clause=where_clause,
6185        )
6186
6187        # Create docker container and launch splice analysis
6188        if splice_config:
6189
6190            # Splice mount folders
6191            mount_folders = splice_config.get("mount", {})
6192
6193            # Genome mount
6194            mount_folders[
6195                config.get("folders", {})
6196                .get("databases", {})
6197                .get("genomes", DEFAULT_GENOME_FOLDER)
6198            ] = "ro"
6199
6200            # SpliceAI mount
6201            mount_folders[
6202                config.get("folders", {})
6203                .get("databases", {})
6204                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6205            ] = "ro"
6206
6207            # Genome mount
6208            mount_folders[
6209                config.get("folders", {})
6210                .get("databases", {})
6211                .get("spip", DEFAULT_SPIP_FOLDER)
6212            ] = "ro"
6213
6214            # Mount folders
6215            mount = []
6216
6217            # Config mount
6218            mount = [
6219                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6220                for path, mode in mount_folders.items()
6221            ]
6222
6223            if any(value for value in splice_config.values() if value is None):
6224                log.warning("At least one splice config parameter is empty")
6225                return None
6226
6227            # Params in splice nf
6228            def check_values(dico: dict):
6229                """
6230                Ensure parameters for NF splice pipeline
6231                """
6232                for key, val in dico.items():
6233                    if key == "genome":
6234                        if any(
6235                            assemb in options.get("genome", {})
6236                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6237                        ):
6238                            yield f"--{key} hg19"
6239                        elif any(
6240                            assemb in options.get("genome", {})
6241                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6242                        ):
6243                            yield f"--{key} hg38"
6244                    elif (
6245                        (isinstance(val, str) and val)
6246                        or isinstance(val, int)
6247                        or isinstance(val, bool)
6248                    ):
6249                        yield f"--{key} {val}"
6250
6251            # Genome
6252            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6253            options["genome"] = genome
6254
6255            # NF params
6256            nf_params = []
6257
6258            # Add options
6259            if options:
6260                nf_params = list(check_values(options))
6261                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6262            else:
6263                log.debug("No NF params provided")
6264
6265            # Add threads
6266            if "threads" not in options.keys():
6267                nf_params.append(f"--threads {threads}")
6268
6269            # Genome path
6270            genome_path = find_genome(
6271                config.get("folders", {})
6272                .get("databases", {})
6273                .get("genomes", DEFAULT_GENOME_FOLDER),
6274                file=f"{genome}.fa",
6275            )
6276            # Add genome path
6277            if not genome_path:
6278                raise ValueError(
6279                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6280                )
6281            else:
6282                log.debug(f"Genome: {genome_path}")
6283                nf_params.append(f"--genome_path {genome_path}")
6284
6285            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6286                """
6287                Setting up updated databases for SPiP and SpliceAI
6288                """
6289
6290                try:
6291
6292                    # SpliceAI assembly transcriptome
6293                    spliceai_assembly = os.path.join(
6294                        config.get("folders", {})
6295                        .get("databases", {})
6296                        .get("spliceai", {}),
6297                        options.get("genome"),
6298                        "transcriptome",
6299                    )
6300                    spip_assembly = options.get("genome")
6301
6302                    spip = find(
6303                        f"transcriptome_{spip_assembly}.RData",
6304                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6305                    )
6306                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6307                    log.debug(f"SPiP annotations: {spip}")
6308                    log.debug(f"SpliceAI annotations: {spliceai}")
6309                    if spip and spliceai:
6310                        return [
6311                            f"--spip_transcriptome {spip}",
6312                            f"--spliceai_annotations {spliceai}",
6313                        ]
6314                    else:
6315                        # TODO crash and go on with basic annotations ?
6316                        # raise ValueError(
6317                        #     "Can't find splice databases in configuration EXIT"
6318                        # )
6319                        log.warning(
6320                            "Can't find splice databases in configuration, use annotations file from image"
6321                        )
6322                except TypeError:
6323                    log.warning(
6324                        "Can't find splice databases in configuration, use annotations file from image"
6325                    )
6326                    return []
6327
6328            # Add options, check if transcriptome option have already beend provided
6329            if (
6330                "spip_transcriptome" not in nf_params
6331                and "spliceai_transcriptome" not in nf_params
6332            ):
6333                splice_reference = splice_annotations(options, config)
6334                if splice_reference:
6335                    nf_params.extend(splice_reference)
6336
6337            nf_params.append(f"--output_folder {output_folder}")
6338
6339            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6340            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6341            log.debug(cmd)
6342
6343            splice_config["docker"]["command"] = cmd
6344
6345            docker_cmd = get_bin_command(
6346                tool="splice",
6347                bin_type="docker",
6348                config=config,
6349                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6350                add_options=f"--name {random_uuid} {' '.join(mount)}",
6351            )
6352
6353            # Docker debug
6354            # if splice_config.get("rm_container"):
6355            #     rm_container = "--rm"
6356            # else:
6357            #     rm_container = ""
6358            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6359
6360            log.debug(docker_cmd)
6361            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6362            log.debug(res.stdout)
6363            if res.stderr:
6364                log.error(res.stderr)
6365            res.check_returncode()
6366        else:
6367            log.warning(f"Splice tool configuration not found: {config}")
6368
6369        # Update variants
6370        log.info("Annotation - Updating...")
6371        # Test find output vcf
6372        log.debug(
6373            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6374        )
6375        output_vcf = []
6376        # Wrong folder to look in
6377        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6378            if (
6379                files
6380                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6381            ):
6382                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6383        # log.debug(os.listdir(options.get("output_folder")))
6384        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6385        if not output_vcf:
6386            log.debug(
6387                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6388            )
6389        else:
6390            # Get new header from annotated vcf
6391            log.debug(f"Initial header: {len(header.infos)} fields")
6392            # Create new header with splice infos
6393            new_vcf = Variants(input=output_vcf[0])
6394            new_vcf_header = new_vcf.get_header().infos
6395            for keys, infos in new_vcf_header.items():
6396                if keys not in header.infos.keys():
6397                    header.infos[keys] = infos
6398            log.debug(f"New header: {len(header.infos)} fields")
6399            log.debug(f"Splice tmp output: {output_vcf[0]}")
6400            self.update_from_vcf(output_vcf[0])
6401
6402        # Remove folder
6403        remove_if_exists(output_folder)

This function annotates variants with splice prediction tools (SPiP and SpliceAI).

Parameters
  • threads: The number of threads to use
Returns

None

def get_config_default(self, name: str) -> dict:
6409    def get_config_default(self, name: str) -> dict:
6410        """
6411        The function `get_config_default` returns a dictionary containing default configurations for
6412        various calculations and prioritizations.
6413
6414        :param name: The `get_config_default` function returns a dictionary containing default
6415        configurations for different calculations and prioritizations. The `name` parameter is used to
6416        specify which specific configuration to retrieve from the dictionary
6417        :type name: str
6418        :return: The function `get_config_default` returns a dictionary containing default configuration
6419        settings for different calculations and prioritizations. The specific configuration settings are
6420        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6421        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6422        returned. If there is no match, an empty dictionary is returned.
6423        """
6424
6425        config_default = {
6426            "calculations": {
6427                "variant_chr_pos_alt_ref": {
6428                    "type": "sql",
6429                    "name": "variant_chr_pos_alt_ref",
6430                    "description": "Create a variant ID with chromosome, position, alt and ref",
6431                    "available": False,
6432                    "output_column_name": "variant_chr_pos_alt_ref",
6433                    "output_column_type": "String",
6434                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6435                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6436                    "operation_info": True,
6437                },
6438                "VARTYPE": {
6439                    "type": "sql",
6440                    "name": "VARTYPE",
6441                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6442                    "available": True,
6443                    "output_column_name": "VARTYPE",
6444                    "output_column_type": "String",
6445                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6446                    "operation_query": """
6447                            CASE
6448                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6449                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6450                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6451                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6452                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6453                                ELSE 'UNDEFINED'
6454                            END
6455                            """,
6456                    "info_fields": ["SVTYPE"],
6457                    "operation_info": True,
6458                },
6459                "snpeff_hgvs": {
6460                    "type": "python",
6461                    "name": "snpeff_hgvs",
6462                    "description": "HGVS nomenclatures from snpEff annotation",
6463                    "available": True,
6464                    "function_name": "calculation_extract_snpeff_hgvs",
6465                    "function_params": ["snpeff_hgvs", "ANN"],
6466                },
6467                "snpeff_ann_explode": {
6468                    "type": "python",
6469                    "name": "snpeff_ann_explode",
6470                    "description": "Explode snpEff annotations with uniquify values",
6471                    "available": True,
6472                    "function_name": "calculation_snpeff_ann_explode",
6473                    "function_params": [False, "fields", "snpeff_", "ANN"],
6474                },
6475                "snpeff_ann_explode_uniquify": {
6476                    "type": "python",
6477                    "name": "snpeff_ann_explode_uniquify",
6478                    "description": "Explode snpEff annotations",
6479                    "available": True,
6480                    "function_name": "calculation_snpeff_ann_explode",
6481                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6482                },
6483                "snpeff_ann_explode_json": {
6484                    "type": "python",
6485                    "name": "snpeff_ann_explode_json",
6486                    "description": "Explode snpEff annotations in JSON format",
6487                    "available": True,
6488                    "function_name": "calculation_snpeff_ann_explode",
6489                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6490                },
6491                "NOMEN": {
6492                    "type": "python",
6493                    "name": "NOMEN",
6494                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6495                    "available": True,
6496                    "function_name": "calculation_extract_nomen",
6497                    "function_params": [],
6498                },
6499                "FINDBYPIPELINE": {
6500                    "type": "python",
6501                    "name": "FINDBYPIPELINE",
6502                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6503                    "available": True,
6504                    "function_name": "calculation_find_by_pipeline",
6505                    "function_params": ["findbypipeline"],
6506                },
6507                "FINDBYSAMPLE": {
6508                    "type": "python",
6509                    "name": "FINDBYSAMPLE",
6510                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6511                    "available": True,
6512                    "function_name": "calculation_find_by_pipeline",
6513                    "function_params": ["findbysample"],
6514                },
6515                "GENOTYPECONCORDANCE": {
6516                    "type": "python",
6517                    "name": "GENOTYPECONCORDANCE",
6518                    "description": "Concordance of genotype for multi caller VCF",
6519                    "available": True,
6520                    "function_name": "calculation_genotype_concordance",
6521                    "function_params": [],
6522                },
6523                "BARCODE": {
6524                    "type": "python",
6525                    "name": "BARCODE",
6526                    "description": "BARCODE as VaRank tool",
6527                    "available": True,
6528                    "function_name": "calculation_barcode",
6529                    "function_params": [],
6530                },
6531                "BARCODEFAMILY": {
6532                    "type": "python",
6533                    "name": "BARCODEFAMILY",
6534                    "description": "BARCODEFAMILY as VaRank tool",
6535                    "available": True,
6536                    "function_name": "calculation_barcode_family",
6537                    "function_params": ["BCF"],
6538                },
6539                "TRIO": {
6540                    "type": "python",
6541                    "name": "TRIO",
6542                    "description": "Inheritance for a trio family",
6543                    "available": True,
6544                    "function_name": "calculation_trio",
6545                    "function_params": [],
6546                },
6547                "VAF": {
6548                    "type": "python",
6549                    "name": "VAF",
6550                    "description": "Variant Allele Frequency (VAF) harmonization",
6551                    "available": True,
6552                    "function_name": "calculation_vaf_normalization",
6553                    "function_params": [],
6554                },
6555                "VAF_stats": {
6556                    "type": "python",
6557                    "name": "VAF_stats",
6558                    "description": "Variant Allele Frequency (VAF) statistics",
6559                    "available": True,
6560                    "function_name": "calculation_genotype_stats",
6561                    "function_params": ["VAF"],
6562                },
6563                "DP_stats": {
6564                    "type": "python",
6565                    "name": "DP_stats",
6566                    "description": "Depth (DP) statistics",
6567                    "available": True,
6568                    "function_name": "calculation_genotype_stats",
6569                    "function_params": ["DP"],
6570                },
6571                "variant_id": {
6572                    "type": "python",
6573                    "name": "variant_id",
6574                    "description": "Variant ID generated from variant position and type",
6575                    "available": True,
6576                    "function_name": "calculation_variant_id",
6577                    "function_params": [],
6578                },
6579                "transcripts_json": {
6580                    "type": "python",
6581                    "name": "transcripts_json",
6582                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6583                    "available": True,
6584                    "function_name": "calculation_transcripts_annotation",
6585                    "function_params": ["transcripts_json", None],
6586                },
6587                "transcripts_ann": {
6588                    "type": "python",
6589                    "name": "transcripts_ann",
6590                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6591                    "available": True,
6592                    "function_name": "calculation_transcripts_annotation",
6593                    "function_params": [None, "transcripts_ann"],
6594                },
6595                "transcripts_annotations": {
6596                    "type": "python",
6597                    "name": "transcripts_annotations",
6598                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6599                    "available": True,
6600                    "function_name": "calculation_transcripts_annotation",
6601                    "function_params": [None, None],
6602                },
6603                "transcripts_prioritization": {
6604                    "type": "python",
6605                    "name": "transcripts_prioritization",
6606                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6607                    "available": True,
6608                    "function_name": "calculation_transcripts_prioritization",
6609                    "function_params": [],
6610                },
6611            },
6612            "prioritizations": {
6613                "default": {
6614                    "ANN2": [
6615                        {
6616                            "type": "contains",
6617                            "value": "HIGH",
6618                            "score": 5,
6619                            "flag": "PASS",
6620                            "comment": [
6621                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6622                            ],
6623                        },
6624                        {
6625                            "type": "contains",
6626                            "value": "MODERATE",
6627                            "score": 3,
6628                            "flag": "PASS",
6629                            "comment": [
6630                                "A non-disruptive variant that might change protein effectiveness"
6631                            ],
6632                        },
6633                        {
6634                            "type": "contains",
6635                            "value": "LOW",
6636                            "score": 0,
6637                            "flag": "FILTERED",
6638                            "comment": [
6639                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6640                            ],
6641                        },
6642                        {
6643                            "type": "contains",
6644                            "value": "MODIFIER",
6645                            "score": 0,
6646                            "flag": "FILTERED",
6647                            "comment": [
6648                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6649                            ],
6650                        },
6651                    ],
6652                }
6653            },
6654        }
6655
6656        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6658    def get_config_json(
6659        self, name: str, config_dict: dict = {}, config_file: str = None
6660    ) -> dict:
6661        """
6662        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6663        default values, a dictionary, and a file.
6664
6665        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6666        the name of the configuration. It is used to identify and retrieve the configuration settings
6667        for a specific component or module
6668        :type name: str
6669        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6670        dictionary that allows you to provide additional configuration settings or overrides. When you
6671        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6672        the key is the configuration setting you want to override or
6673        :type config_dict: dict
6674        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6675        specify the path to a configuration file that contains additional settings. If provided, the
6676        function will read the contents of this file and update the configuration dictionary with the
6677        values found in the file, overriding any existing values with the
6678        :type config_file: str
6679        :return: The function `get_config_json` returns a dictionary containing the configuration
6680        settings.
6681        """
6682
6683        # Create with default prioritizations
6684        config_default = self.get_config_default(name=name)
6685        configuration = config_default
6686        # log.debug(f"configuration={configuration}")
6687
6688        # Replace prioritizations from dict
6689        for config in config_dict:
6690            configuration[config] = config_dict[config]
6691
6692        # Replace prioritizations from file
6693        config_file = full_path(config_file)
6694        if config_file:
6695            if os.path.exists(config_file):
6696                with open(config_file) as config_file_content:
6697                    config_file_dict = json.load(config_file_content)
6698                for config in config_file_dict:
6699                    configuration[config] = config_file_dict[config]
6700            else:
6701                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6702                log.error(msg_error)
6703                raise ValueError(msg_error)
6704
6705        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: a dictionary of additional configuration settings or overrides; each key-value pair replaces (or adds) the corresponding entry in the default configuration
  • config_file: the path to a JSON configuration file containing additional settings; if provided, its contents are read and override any existing values in the configuration
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
6707    def prioritization(
6708        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
6709    ) -> bool:
6710        """
6711        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
6712        prioritizes variants based on configured profiles and criteria.
6713
6714        :param table: The `table` parameter in the `prioritization` function is used to specify the name
6715        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
6716        a table name is provided, the method will prioritize the variants in that specific table
6717        :type table: str
6718        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
6719        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
6720        provided, the code will use a default prefix value of "PZ"
6721        :type pz_prefix: str
6722        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
6723        additional parameters specific to the prioritization process. These parameters can include
6724        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
6725        configurations needed for the prioritization of variants in a V
6726        :type pz_param: dict
6727        :return: A boolean value (True) is being returned from the `prioritization` function.
6728        """
6729
6730        # Config
6731        config = self.get_config()
6732
6733        # Param
6734        param = self.get_param()
6735
6736        # Prioritization param
6737        if pz_param is not None:
6738            prioritization_param = pz_param
6739        else:
6740            prioritization_param = param.get("prioritization", {})
6741
6742        # Configuration profiles
6743        prioritization_config_file = prioritization_param.get(
6744            "prioritization_config", None
6745        )
6746        prioritization_config_file = full_path(prioritization_config_file)
6747        prioritizations_config = self.get_config_json(
6748            name="prioritizations", config_file=prioritization_config_file
6749        )
6750
6751        # Prioritization prefix
6752        pz_prefix_default = "PZ"
6753        if pz_prefix is None:
6754            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
6755
6756        # Prioritization options
6757        profiles = prioritization_param.get("profiles", [])
6758        if isinstance(profiles, str):
6759            profiles = profiles.split(",")
6760        pzfields = prioritization_param.get(
6761            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
6762        )
6763        if isinstance(pzfields, str):
6764            pzfields = pzfields.split(",")
6765        default_profile = prioritization_param.get("default_profile", None)
6766        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
6767        prioritization_score_mode = prioritization_param.get(
6768            "prioritization_score_mode", "HOWARD"
6769        )
6770
6771        # Quick Prioritizations
6772        prioritizations = param.get("prioritizations", None)
6773        if prioritizations:
6774            log.info("Quick Prioritization:")
6775            for profile in prioritizations.split(","):
6776                if profile not in profiles:
6777                    profiles.append(profile)
6778                    log.info(f"   {profile}")
6779
6780        # If profile "ALL" provided, all profiles in the config profiles
6781        if "ALL" in profiles:
6782            profiles = list(prioritizations_config.keys())
6783
6784        for profile in profiles:
6785            if prioritizations_config.get(profile, None):
6786                log.debug(f"Profile '{profile}' configured")
6787            else:
6788                msg_error = f"Profile '{profile}' NOT configured"
6789                log.error(msg_error)
6790                raise ValueError(msg_error)
6791
6792        if profiles:
6793            log.info(f"Prioritization... ")
6794        else:
6795            log.debug(f"No profile defined")
6796            return False
6797
6798        if not default_profile and len(profiles):
6799            default_profile = profiles[0]
6800
6801        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6802        log.debug("Profiles to check: " + str(list(profiles)))
6803
6804        # Variables
6805        if table is not None:
6806            table_variants = table
6807        else:
6808            table_variants = self.get_table_variants(clause="update")
6809        log.debug(f"Table to prioritize: {table_variants}")
6810
6811        # Added columns
6812        added_columns = []
6813
6814        # Create list of PZfields
6815        # List of PZFields
6816        list_of_pzfields_original = pzfields + [
6817            pzfield + pzfields_sep + profile
6818            for pzfield in pzfields
6819            for profile in profiles
6820        ]
6821        list_of_pzfields = []
6822        log.debug(f"{list_of_pzfields_original}")
6823
6824        # Remove existing PZfields to use if exists
6825        for pzfield in list_of_pzfields_original:
6826            if self.get_header().infos.get(pzfield, None) is None:
6827                list_of_pzfields.append(pzfield)
6828                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6829            else:
6830                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6831
6832        if list_of_pzfields:
6833
6834            # Explode Infos prefix
6835            explode_infos_prefix = self.get_explode_infos_prefix()
6836
6837            # PZfields tags description
6838            PZfields_INFOS = {
6839                f"{pz_prefix}Tags": {
6840                    "ID": f"{pz_prefix}Tags",
6841                    "Number": ".",
6842                    "Type": "String",
6843                    "Description": "Variant tags based on annotation criteria",
6844                },
6845                f"{pz_prefix}Score": {
6846                    "ID": f"{pz_prefix}Score",
6847                    "Number": 1,
6848                    "Type": "Integer",
6849                    "Description": "Variant score based on annotation criteria",
6850                },
6851                f"{pz_prefix}Flag": {
6852                    "ID": f"{pz_prefix}Flag",
6853                    "Number": 1,
6854                    "Type": "String",
6855                    "Description": "Variant flag based on annotation criteria",
6856                },
6857                f"{pz_prefix}Comment": {
6858                    "ID": f"{pz_prefix}Comment",
6859                    "Number": ".",
6860                    "Type": "String",
6861                    "Description": "Variant comment based on annotation criteria",
6862                },
6863                f"{pz_prefix}Infos": {
6864                    "ID": f"{pz_prefix}Infos",
6865                    "Number": ".",
6866                    "Type": "String",
6867                    "Description": "Variant infos based on annotation criteria",
6868                },
6869                f"{pz_prefix}Class": {
6870                    "ID": f"{pz_prefix}Class",
6871                    "Number": ".",
6872                    "Type": "String",
6873                    "Description": "Variant class based on annotation criteria",
6874                },
6875            }
6876
6877            # Create INFO fields if not exist
6878            for field in PZfields_INFOS:
6879                field_ID = PZfields_INFOS[field]["ID"]
6880                field_description = PZfields_INFOS[field]["Description"]
6881                if field_ID not in self.get_header().infos and field_ID in pzfields:
6882                    field_description = (
6883                        PZfields_INFOS[field]["Description"]
6884                        + f", profile {default_profile}"
6885                    )
6886                    self.get_header().infos[field_ID] = vcf.parser._Info(
6887                        field_ID,
6888                        PZfields_INFOS[field]["Number"],
6889                        PZfields_INFOS[field]["Type"],
6890                        field_description,
6891                        "unknown",
6892                        "unknown",
6893                        code_type_map[PZfields_INFOS[field]["Type"]],
6894                    )
6895
6896            # Create INFO fields if not exist for each profile
6897            for profile in prioritizations_config:
6898                if profile in profiles or profiles == []:
6899                    for field in PZfields_INFOS:
6900                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6901                        field_description = (
6902                            PZfields_INFOS[field]["Description"]
6903                            + f", profile {profile}"
6904                        )
6905                        if (
6906                            field_ID not in self.get_header().infos
6907                            and field in pzfields
6908                        ):
6909                            self.get_header().infos[field_ID] = vcf.parser._Info(
6910                                field_ID,
6911                                PZfields_INFOS[field]["Number"],
6912                                PZfields_INFOS[field]["Type"],
6913                                field_description,
6914                                "unknown",
6915                                "unknown",
6916                                code_type_map[PZfields_INFOS[field]["Type"]],
6917                            )
6918
6919            # Header
6920            for pzfield in list_of_pzfields:
6921                if re.match(f"{pz_prefix}Score.*", pzfield):
6922                    added_column = self.add_column(
6923                        table_name=table_variants,
6924                        column_name=pzfield,
6925                        column_type="INTEGER",
6926                        default_value="0",
6927                    )
6928                elif re.match(f"{pz_prefix}Flag.*", pzfield):
6929                    added_column = self.add_column(
6930                        table_name=table_variants,
6931                        column_name=pzfield,
6932                        column_type="BOOLEAN",
6933                        default_value="1",
6934                    )
6935                elif re.match(f"{pz_prefix}Class.*", pzfield):
6936                    added_column = self.add_column(
6937                        table_name=table_variants,
6938                        column_name=pzfield,
6939                        column_type="VARCHAR[]",
6940                        default_value="null",
6941                    )
6942                else:
6943                    added_column = self.add_column(
6944                        table_name=table_variants,
6945                        column_name=pzfield,
6946                        column_type="STRING",
6947                        default_value="''",
6948                    )
6949                added_columns.append(added_column)
6950
6951            # Profiles
6952            if profiles:
6953
6954                # foreach profile in configuration file
6955                for profile in prioritizations_config:
6956
6957                    # If profile is asked in param, or ALL are asked (empty profile [])
6958                    if profile in profiles or profiles == []:
6959                        log.info(f"Profile '{profile}'")
6960
6961                        sql_set_info_option = ""
6962
6963                        sql_set_info = []
6964
6965                        # PZ fields set
6966
6967                        # PZScore
6968                        if (
6969                            f"{pz_prefix}Score{pzfields_sep}{profile}"
6970                            in list_of_pzfields
6971                        ):
6972                            sql_set_info.append(
6973                                f"""
6974                                    concat(
6975                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
6976                                        {pz_prefix}Score{pzfields_sep}{profile}
6977                                    ) 
6978                                """
6979                            )
6980                            if (
6981                                profile == default_profile
6982                                and f"{pz_prefix}Score" in list_of_pzfields
6983                            ):
6984                                sql_set_info.append(
6985                                    f"""
6986                                        concat(
6987                                            '{pz_prefix}Score=',
6988                                            {pz_prefix}Score{pzfields_sep}{profile}
6989                                        )
6990                                    """
6991                                )
6992
6993                        # PZFlag
6994                        if (
6995                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
6996                            in list_of_pzfields
6997                        ):
6998                            sql_set_info.append(
6999                                f"""
7000                                    concat(
7001                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
7002                                        CASE 
7003                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7004                                            THEN 'PASS'
7005                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7006                                            THEN 'FILTERED'
7007                                        END
7008                                    ) 
7009                                """
7010                            )
7011                            if (
7012                                profile == default_profile
7013                                and f"{pz_prefix}Flag" in list_of_pzfields
7014                            ):
7015                                sql_set_info.append(
7016                                    f"""
7017                                        concat(
7018                                            '{pz_prefix}Flag=',
7019                                            CASE 
7020                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7021                                                THEN 'PASS'
7022                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7023                                                THEN 'FILTERED'
7024                                            END
7025                                        )
7026                                    """
7027                                )
7028
7029                        # PZClass
7030                        if (
7031                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7032                            in list_of_pzfields
7033                        ):
7034                            sql_set_info.append(
7035                                f"""
7036                                    concat(
7037                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7038                                        CASE
7039                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7040                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7041                                            ELSE '.'
7042                                        END 
7043                                    )
7044                                    
7045                                """
7046                            )
7047                            if (
7048                                profile == default_profile
7049                                and f"{pz_prefix}Class" in list_of_pzfields
7050                            ):
7051                                sql_set_info.append(
7052                                    f"""
7053                                        concat(
7054                                            '{pz_prefix}Class=',
7055                                            CASE
7056                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7057                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7058                                                ELSE '.'
7059                                            END 
7060                                        )
7061                                    """
7062                                )
7063
7064                        # PZComment
7065                        if (
7066                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7067                            in list_of_pzfields
7068                        ):
7069                            sql_set_info.append(
7070                                f"""
7071                                    CASE
7072                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7073                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7074                                        ELSE ''
7075                                    END
7076                                """
7077                            )
7078                            if (
7079                                profile == default_profile
7080                                and f"{pz_prefix}Comment" in list_of_pzfields
7081                            ):
7082                                sql_set_info.append(
7083                                    f"""
7084                                        CASE
7085                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7086                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7087                                            ELSE ''
7088                                        END
7089                                    """
7090                                )
7091
7092                        # PZInfos
7093                        if (
7094                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7095                            in list_of_pzfields
7096                        ):
7097                            sql_set_info.append(
7098                                f"""
7099                                    CASE
7100                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7101                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7102                                        ELSE ''
7103                                    END
7104                                """
7105                            )
7106                            if (
7107                                profile == default_profile
7108                                and f"{pz_prefix}Infos" in list_of_pzfields
7109                            ):
7110                                sql_set_info.append(
7111                                    f"""
7112                                        CASE
7113                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7114                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7115                                            ELSE ''
7116                                        END
7117                                    """
7118                                )
7119
7120                        # Merge PZfields
7121                        sql_set_info_option = ""
7122                        sql_set_sep = ""
7123                        for sql_set in sql_set_info:
7124                            if sql_set_sep:
7125                                sql_set_info_option += f"""
7126                                    , concat('{sql_set_sep}', {sql_set})
7127                                """
7128                            else:
7129                                sql_set_info_option += f"""
7130                                    , {sql_set}
7131                                """
7132                            sql_set_sep = ";"
7133
7134                        sql_queries = []
7135                        for annotation in prioritizations_config[profile]:
7136
7137                            # skip special sections
7138                            if annotation.startswith("_"):
7139                                continue
7140
7141                            # For each criterions
7142                            for criterion in prioritizations_config[profile][
7143                                annotation
7144                            ]:
7145
7146                                # Criterion mode
7147                                criterion_mode = None
7148                                if np.any(
7149                                    np.isin(list(criterion.keys()), ["type", "value"])
7150                                ):
7151                                    criterion_mode = "operation"
7152                                elif np.any(
7153                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7154                                ):
7155                                    criterion_mode = "sql"
7156                                log.debug(f"Criterion Mode: {criterion_mode}")
7157
7158                                # Criterion parameters
7159                                criterion_type = criterion.get("type", None)
7160                                criterion_value = criterion.get("value", None)
7161                                criterion_sql = criterion.get("sql", None)
7162                                criterion_fields = criterion.get("fields", None)
7163                                criterion_score = criterion.get("score", 0)
7164                                criterion_flag = criterion.get("flag", "PASS")
7165                                criterion_class = criterion.get("class", None)
7166                                criterion_flag_bool = criterion_flag == "PASS"
7167                                criterion_comment = (
7168                                    ", ".join(criterion.get("comment", []))
7169                                    .replace("'", "''")
7170                                    .replace(";", ",")
7171                                    .replace("\t", " ")
7172                                )
7173                                criterion_infos = (
7174                                    str(criterion)
7175                                    .replace("'", "''")
7176                                    .replace(";", ",")
7177                                    .replace("\t", " ")
7178                                )
7179
7180                                # SQL
7181                                if criterion_sql is not None and isinstance(
7182                                    criterion_sql, list
7183                                ):
7184                                    criterion_sql = " ".join(criterion_sql)
7185
7186                                # Fields and explode
7187                                if criterion_fields is None:
7188                                    criterion_fields = [annotation]
7189                                if not isinstance(criterion_fields, list):
7190                                    criterion_fields = str(criterion_fields).split(",")
7191
7192                                # Class
7193                                if criterion_class is not None and not isinstance(
7194                                    criterion_class, list
7195                                ):
7196                                    criterion_class = str(criterion_class).split(",")
7197
7198                                for annotation_field in criterion_fields:
7199
7200                                    # Explode specific annotation
7201                                    log.debug(
7202                                        f"Explode annotation '{annotation_field}'"
7203                                    )
7204                                    added_columns += self.explode_infos(
7205                                        prefix=explode_infos_prefix,
7206                                        fields=[annotation_field],
7207                                        table=table_variants,
7208                                    )
7209                                    extra_infos = self.get_extra_infos(
7210                                        table=table_variants
7211                                    )
7212
7213                                    # Check if annotation field is present
7214                                    if (
7215                                        f"{explode_infos_prefix}{annotation_field}"
7216                                        not in extra_infos
7217                                    ):
7218                                        msq_err = f"Annotation '{annotation_field}' not in data"
7219                                        log.error(msq_err)
7220                                        raise ValueError(msq_err)
7221                                    else:
7222                                        log.debug(
7223                                            f"Annotation '{annotation_field}' in data"
7224                                        )
7225
7226                                sql_set = []
7227                                sql_set_info = []
7228
7229                                # PZ fields set
7230
7231                                # PZScore
7232                                if (
7233                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7234                                    in list_of_pzfields
7235                                ):
7236                                    # if prioritization_score_mode == "HOWARD":
7237                                    #     sql_set.append(
7238                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7239                                    #     )
7240                                    # VaRank prioritization score mode
7241                                    if prioritization_score_mode == "VaRank":
7242                                        sql_set.append(
7243                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
7244                                        )
7245                                    # default HOWARD prioritization score mode
7246                                    else:
7247                                        sql_set.append(
7248                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7249                                        )
7250
7251                                # PZFlag
7252                                if (
7253                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7254                                    in list_of_pzfields
7255                                ):
7256                                    sql_set.append(
7257                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7258                                    )
7259
7260                                # PZClass
7261                                if (
7262                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7263                                    in list_of_pzfields
7264                                    and criterion_class is not None
7265                                ):
7266                                    sql_set.append(
7267                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7268                                    )
7269
7270                                # PZComment
7271                                if (
7272                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7273                                    in list_of_pzfields
7274                                ):
7275                                    sql_set.append(
7276                                        f"""
7277                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7278                                                concat(
7279                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7280                                                    CASE 
7281                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7282                                                        THEN ', '
7283                                                        ELSE ''
7284                                                    END,
7285                                                    '{criterion_comment}'
7286                                                )
7287                                        """
7288                                    )
7289
7290                                # PZInfos
7291                                if (
7292                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7293                                    in list_of_pzfields
7294                                ):
7295                                    sql_set.append(
7296                                        f"""
7297                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7298                                                concat(
7299                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7300                                                    '{criterion_infos}'
7301                                                )
7302                                        """
7303                                    )
7304                                sql_set_option = ",".join(sql_set)
7305
7306                                # Criterion and comparison
7307                                if sql_set_option:
7308
7309                                    if criterion_mode in ["operation"]:
7310
7311                                        try:
7312                                            float(criterion_value)
7313                                            sql_update = f"""
7314                                                UPDATE {table_variants}
7315                                                SET {sql_set_option}
7316                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7317                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7318                                            """
7319                                        except:
7320                                            contains_option = ""
7321                                            if criterion_type == "contains":
7322                                                contains_option = ".*"
7323                                            sql_update = f"""
7324                                                UPDATE {table_variants}
7325                                                SET {sql_set_option}
7326                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7327                                            """
7328                                        sql_queries.append(sql_update)
7329
7330                                    elif criterion_mode in ["sql"]:
7331
7332                                        sql_update = f"""
7333                                            UPDATE {table_variants}
7334                                            SET {sql_set_option}
7335                                            WHERE {criterion_sql}
7336                                        """
7337                                        sql_queries.append(sql_update)
7338
7339                                    else:
7340                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7341                                        log.error(msg_err)
7342                                        raise ValueError(msg_err)
7343
7344                                else:
7345                                    log.warning(
7346                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7347                                    )
7348
7349                        # PZTags
7350                        if (
7351                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7352                            in list_of_pzfields
7353                        ):
7354
7355                            # Create PZFalgs value
7356                            pztags_value = ""
7357                            pztags_sep_default = ","
7358                            pztags_sep = ""
7359                            for pzfield in pzfields:
7360                                if pzfield not in [f"{pz_prefix}Tags"]:
7361                                    if (
7362                                        f"{pzfield}{pzfields_sep}{profile}"
7363                                        in list_of_pzfields
7364                                    ):
7365                                        if pzfield in [f"{pz_prefix}Flag"]:
7366                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7367                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7368                                                    THEN 'PASS'
7369                                                    ELSE 'FILTERED'
7370                                                END, '"""
7371                                        elif pzfield in [f"{pz_prefix}Class"]:
7372                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7373                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7374                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7375                                                    ELSE '.'
7376                                                END, '"""
7377                                        else:
7378                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7379                                        pztags_sep = pztags_sep_default
7380
7381                            # Add Query update for PZFlags
7382                            sql_update_pztags = f"""
7383                                UPDATE {table_variants}
7384                                SET INFO = concat(
7385                                        INFO,
7386                                        CASE WHEN INFO NOT in ('','.')
7387                                                THEN ';'
7388                                                ELSE ''
7389                                        END,
7390                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7391                                    )
7392                                """
7393                            sql_queries.append(sql_update_pztags)
7394
7395                            # Add Query update for PZFlags for default
7396                            if profile == default_profile:
7397                                sql_update_pztags_default = f"""
7398                                UPDATE {table_variants}
7399                                SET INFO = concat(
7400                                        INFO,
7401                                        ';',
7402                                        '{pz_prefix}Tags={pztags_value}'
7403                                    )
7404                                """
7405                                sql_queries.append(sql_update_pztags_default)
7406
7407                        log.info(f"""Profile '{profile}' - Prioritization... """)
7408
7409                        if sql_queries:
7410
7411                            for sql_query in sql_queries:
7412                                log.debug(
7413                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7414                                )
7415                                self.conn.execute(sql_query)
7416
7417                        log.info(f"""Profile '{profile}' - Update... """)
7418                        sql_query_update = f"""
7419                            UPDATE {table_variants}
7420                            SET INFO =  
7421                                concat(
7422                                    CASE
7423                                        WHEN INFO NOT IN ('','.')
7424                                        THEN concat(INFO, ';')
7425                                        ELSE ''
7426                                    END
7427                                    {sql_set_info_option}
7428                                )
7429                        """
7430                        self.conn.execute(sql_query_update)
7431
7432        else:
7433
7434            log.warning(f"No profiles in parameters")
7435
7436        # Remove added columns
7437        for added_column in added_columns:
7438            self.drop_column(column=added_column)
7439
7440        # Explode INFOS fields into table fields
7441        if self.get_explode_infos():
7442            self.explode_infos(
7443                prefix=self.get_explode_infos_prefix(),
7444                fields=self.get_explode_infos_fields(),
7445                force=True,
7446            )
7447
7448        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

A boolean value (True) is being returned from the prioritization function.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Side effects: appends an 'hgvs=' field to the INFO column of the variants
        table, registers the 'hgvs' INFO field in the VCF header, and drops the
        temporary working column before returning.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object providing the "CHROM", "POS", "REF"
            and "ALT" values of one variant
            :return: a string that contains the HGVS names associated with the given row of data,
            comma-separated.
            """

            # NOTE(review): 'chr' shadows the builtin of the same name; it is
            # local to this closure so harmless, but renaming would be clearer
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (refseq_df is resolved by the polars SQL context from the
            # enclosing method's scope)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When 'add_protein' is set (without use_protein/full_format),
                # also emit a second, protein-level HGVS name per transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Genome path exactly as configured (no default)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: 'hgvs_options' is a comma-separated list of key[=value]
        # pairs folded into the 'hgvs' param section (missing value means True)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                # Normalize boolean-like string values
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            # Nothing to do when HGVS annotation is not requested
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        # NOTE(review): find_genome is called twice with the same argument on
        # the success path; could be collapsed into a single call
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT made of letters only)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # (random suffix to avoid clashing with an existing column name)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (only transcripts overlapping a variant position are kept)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            # NOTE(review): if no refSeqLink file is found, refseqlink_df is
            # never created, yet the partition closure queries it whenever
            # protein annotation is requested — confirm that combination
            # cannot occur
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): re-created here (already created above) — presumably so
        # the context registers the refseq_df/refseqlink_df frames now in
        # scope; confirm the first construction is actually needed
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column (match on full variant key, skip empty/NULL)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column (append 'hgvs=' field, ';'-separated when INFO
        # already has content)
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7841    def get_operations_help(
7842        self, operations_config_dict: dict = {}, operations_config_file: str = None
7843    ) -> list:
7844
7845        # Init
7846        operations_help = []
7847
7848        # operations
7849        operations = self.get_config_json(
7850            name="calculations",
7851            config_dict=operations_config_dict,
7852            config_file=operations_config_file,
7853        )
7854        for op in operations:
7855            op_name = operations[op].get("name", op).upper()
7856            op_description = operations[op].get("description", op_name)
7857            op_available = operations[op].get("available", False)
7858            if op_available:
7859                operations_help.append(f"   {op_name}: {op_description}")
7860
7861        # Sort operations
7862        operations_help.sort()
7863
7864        # insert header
7865        operations_help.insert(0, "Available calculation operations:")
7866
7867        # Return
7868        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7870    def calculation(
7871        self,
7872        operations: dict = {},
7873        operations_config_dict: dict = {},
7874        operations_config_file: str = None,
7875    ) -> None:
7876        """
7877        It takes a list of operations, and for each operation, it checks if it's a python or sql
7878        operation, and then calls the appropriate function
7879
7880        param json example:
7881            "calculation": {
7882                "NOMEN": {
7883                    "options": {
7884                        "hgvs_field": "hgvs"
7885                    },
7886                "middle" : null
7887            }
7888        """
7889
7890        # Param
7891        param = self.get_param()
7892
7893        # operations config
7894        operations_config = self.get_config_json(
7895            name="calculations",
7896            config_dict=operations_config_dict,
7897            config_file=operations_config_file,
7898        )
7899
7900        # Upper keys
7901        operations_config = {k.upper(): v for k, v in operations_config.items()}
7902
7903        # Calculations
7904
7905        # Operations from param
7906        operations = param.get("calculation", {}).get("calculations", operations)
7907
7908        # Quick calculation - add
7909        if param.get("calculations", None):
7910            calculations_list = [
7911                value for value in param.get("calculations", "").split(",")
7912            ]
7913            log.info(f"Quick Calculations:")
7914            for calculation_key in calculations_list:
7915                log.info(f"   {calculation_key}")
7916            for calculation_operation in calculations_list:
7917                if calculation_operation.upper() not in operations:
7918                    operations[calculation_operation.upper()] = {}
7919                    add_value_into_dict(
7920                        dict_tree=param,
7921                        sections=[
7922                            "calculation",
7923                            "calculations",
7924                            calculation_operation.upper(),
7925                        ],
7926                        value={},
7927                    )
7928
7929        # Operations for calculation
7930        if not operations:
7931            operations = param.get("calculation", {}).get("calculations", {})
7932
7933        if operations:
7934            log.info(f"Calculations...")
7935
7936        # For each operations
7937        for operation_name in operations:
7938            operation_name = operation_name.upper()
7939            if operation_name not in [""]:
7940                if operation_name in operations_config:
7941                    log.info(f"Calculation '{operation_name}'")
7942                    operation = operations_config[operation_name]
7943                    operation_type = operation.get("type", "sql")
7944                    if operation_type == "python":
7945                        self.calculation_process_function(
7946                            operation=operation, operation_name=operation_name
7947                        )
7948                    elif operation_type == "sql":
7949                        self.calculation_process_sql(
7950                            operation=operation, operation_name=operation_name
7951                        )
7952                    else:
7953                        log.error(
7954                            f"Operations config: Type '{operation_type}' NOT available"
7955                        )
7956                        raise ValueError(
7957                            f"Operations config: Type '{operation_type}' NOT available"
7958                        )
7959                else:
7960                    log.error(
7961                        f"Operations config: Calculation '{operation_name}' NOT available"
7962                    )
7963                    raise ValueError(
7964                        f"Operations config: Calculation '{operation_name}' NOT available"
7965                    )
7966
7967        # Explode INFOS fields into table fields
7968        if self.get_explode_infos():
7969            self.explode_infos(
7970                prefix=self.get_explode_infos_prefix(),
7971                fields=self.get_explode_infos_fields(),
7972                force=True,
7973            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7975    def calculation_process_sql(
7976        self, operation: dict, operation_name: str = "unknown"
7977    ) -> None:
7978        """
7979        The `calculation_process_sql` function takes in a mathematical operation as a string and
7980        performs the operation, updating the specified table with the result.
7981
7982        :param operation: The `operation` parameter is a dictionary that contains information about the
7983        mathematical operation to be performed. It includes the following keys:
7984        :type operation: dict
7985        :param operation_name: The `operation_name` parameter is a string that represents the name of
7986        the mathematical operation being performed. It is used for logging and error handling purposes,
7987        defaults to unknown
7988        :type operation_name: str (optional)
7989        """
7990
7991        # table variants
7992        table_variants = self.get_table_variants(clause="alter")
7993
7994        # Operation infos
7995        operation_name = operation.get("name", "unknown")
7996        log.debug(f"process sql {operation_name}")
7997        output_column_name = operation.get("output_column_name", operation_name)
7998        output_column_type = operation.get("output_column_type", "String")
7999        prefix = operation.get("explode_infos_prefix", "")
8000        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
8001        output_column_description = operation.get(
8002            "output_column_description", f"{operation_name} operation"
8003        )
8004        operation_query = operation.get("operation_query", None)
8005        if isinstance(operation_query, list):
8006            operation_query = " ".join(operation_query)
8007        operation_info_fields = operation.get("info_fields", [])
8008        operation_info_fields_check = operation.get("info_fields_check", False)
8009        operation_info = operation.get("operation_info", True)
8010
8011        if operation_query:
8012
8013            # Info fields check
8014            operation_info_fields_check_result = True
8015            if operation_info_fields_check:
8016                header_infos = self.get_header().infos
8017                for info_field in operation_info_fields:
8018                    operation_info_fields_check_result = (
8019                        operation_info_fields_check_result
8020                        and info_field in header_infos
8021                    )
8022
8023            # If info fields available
8024            if operation_info_fields_check_result:
8025
8026                # Added_columns
8027                added_columns = []
8028
8029                # Create VCF header field
8030                vcf_reader = self.get_header()
8031                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8032                    output_column_name,
8033                    ".",
8034                    output_column_type,
8035                    output_column_description,
8036                    "howard calculation",
8037                    "0",
8038                    self.code_type_map.get(output_column_type),
8039                )
8040
8041                # Explode infos if needed
8042                log.debug(f"calculation_process_sql prefix {prefix}")
8043                added_columns += self.explode_infos(
8044                    prefix=prefix,
8045                    fields=[output_column_name] + operation_info_fields,
8046                    force=True,
8047                )
8048
8049                # Create column
8050                added_column = self.add_column(
8051                    table_name=table_variants,
8052                    column_name=prefix + output_column_name,
8053                    column_type=output_column_type_sql,
8054                    default_value="null",
8055                )
8056                added_columns.append(added_column)
8057
8058                # Operation calculation
8059                try:
8060
8061                    # Query to update calculation column
8062                    sql_update = f"""
8063                        UPDATE {table_variants}
8064                        SET "{prefix}{output_column_name}" = ({operation_query})
8065                    """
8066                    self.conn.execute(sql_update)
8067
8068                    # Add to INFO
8069                    if operation_info:
8070                        sql_update_info = f"""
8071                            UPDATE {table_variants}
8072                            SET "INFO" =
8073                                concat(
8074                                    CASE
8075                                        WHEN "INFO" IS NOT NULL
8076                                        THEN concat("INFO", ';')
8077                                        ELSE ''
8078                                    END,
8079                                    '{output_column_name}=',
8080                                    "{prefix}{output_column_name}"
8081                                )
8082                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8083                        """
8084                        self.conn.execute(sql_update_info)
8085
8086                except:
8087                    log.error(
8088                        f"Operations config: Calculation '{operation_name}' query failed"
8089                    )
8090                    raise ValueError(
8091                        f"Operations config: Calculation '{operation_name}' query failed"
8092                    )
8093
8094                # Remove added columns
8095                for added_column in added_columns:
8096                    log.debug(f"added_column: {added_column}")
8097                    self.drop_column(column=added_column)
8098
8099            else:
8100                log.error(
8101                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8102                )
8103                raise ValueError(
8104                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8105                )
8106
8107        else:
8108            log.error(
8109                f"Operations config: Calculation '{operation_name}' query NOT defined"
8110            )
8111            raise ValueError(
8112                f"Operations config: Calculation '{operation_name}' query NOT defined"
8113            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed (its name, SQL query, output column and related options)
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8115    def calculation_process_function(
8116        self, operation: dict, operation_name: str = "unknown"
8117    ) -> None:
8118        """
8119        The `calculation_process_function` takes in an operation dictionary and performs the specified
8120        function with the given parameters.
8121
8122        :param operation: The `operation` parameter is a dictionary that contains information about the
8123        operation to be performed. It has the following keys:
8124        :type operation: dict
8125        :param operation_name: The `operation_name` parameter is a string that represents the name of
8126        the operation being performed. It is used for logging purposes, defaults to unknown
8127        :type operation_name: str (optional)
8128        """
8129
8130        operation_name = operation["name"]
8131        log.debug(f"process sql {operation_name}")
8132        function_name = operation["function_name"]
8133        function_params = operation["function_params"]
8134        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8136    def calculation_variant_id(self) -> None:
8137        """
8138        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8139        updates the INFO field of a variants table with the variant ID.
8140        """
8141
8142        # variant_id annotation field
8143        variant_id_tag = self.get_variant_id_column()
8144        added_columns = [variant_id_tag]
8145
8146        # variant_id hgvs tags"
8147        vcf_infos_tags = {
8148            variant_id_tag: "howard variant ID annotation",
8149        }
8150
8151        # Variants table
8152        table_variants = self.get_table_variants()
8153
8154        # Header
8155        vcf_reader = self.get_header()
8156
8157        # Add variant_id to header
8158        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8159            variant_id_tag,
8160            ".",
8161            "String",
8162            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8163            "howard calculation",
8164            "0",
8165            self.code_type_map.get("String"),
8166        )
8167
8168        # Update
8169        sql_update = f"""
8170            UPDATE {table_variants}
8171            SET "INFO" = 
8172                concat(
8173                    CASE
8174                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8175                        THEN ''
8176                        ELSE concat("INFO", ';')
8177                    END,
8178                    '{variant_id_tag}=',
8179                    "{variant_id_tag}"
8180                )
8181        """
8182        self.conn.execute(sql_update)
8183
8184        # Remove added columns
8185        for added_column in added_columns:
8186            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8188    def calculation_extract_snpeff_hgvs(
8189        self,
8190        snpeff_hgvs: str = "snpeff_hgvs",
8191        snpeff_field: str = "ANN",
8192    ) -> None:
8193        """
8194        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8195        annotation field in a VCF file and adds them as a new column in the variants table.
8196
8197        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8198        function is used to specify the name of the column that will store the HGVS nomenclatures
8199        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8200        snpeff_hgvs
8201        :type snpeff_hgvs: str (optional)
8202        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8203        function represents the field in the VCF file that contains SnpEff annotations. This field is
8204        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8205        to ANN
8206        :type snpeff_field: str (optional)
8207        """
8208
8209        # Snpeff hgvs tags
8210        vcf_infos_tags = {
8211            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8212        }
8213
8214        # Prefix
8215        prefix = self.get_explode_infos_prefix()
8216        if prefix:
8217            prefix = "INFO/"
8218
8219        # snpEff fields
8220        speff_ann_infos = prefix + snpeff_field
8221        speff_hgvs_infos = prefix + snpeff_hgvs
8222
8223        # Variants table
8224        table_variants = self.get_table_variants()
8225
8226        # Header
8227        vcf_reader = self.get_header()
8228
8229        # Add columns
8230        added_columns = []
8231
8232        # Explode HGVS field in column
8233        added_columns += self.explode_infos(fields=[snpeff_field])
8234
8235        if snpeff_field in vcf_reader.infos:
8236
8237            log.debug(vcf_reader.infos[snpeff_field])
8238
8239            # Extract ANN header
8240            ann_description = vcf_reader.infos[snpeff_field].desc
8241            pattern = r"'(.+?)'"
8242            match = re.search(pattern, ann_description)
8243            if match:
8244                ann_header_match = match.group(1).split(" | ")
8245                ann_header_desc = {}
8246                for i in range(len(ann_header_match)):
8247                    ann_header_info = "".join(
8248                        char for char in ann_header_match[i] if char.isalnum()
8249                    )
8250                    ann_header_desc[ann_header_info] = ann_header_match[i]
8251                if not ann_header_desc:
8252                    raise ValueError("Invalid header description format")
8253            else:
8254                raise ValueError("Invalid header description format")
8255
8256            # Create variant id
8257            variant_id_column = self.get_variant_id_column()
8258            added_columns += [variant_id_column]
8259
8260            # Create dataframe
8261            dataframe_snpeff_hgvs = self.get_query_to_df(
8262                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8263            )
8264
8265            # Create main NOMEN column
8266            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8267                speff_ann_infos
8268            ].apply(
8269                lambda x: extract_snpeff_hgvs(
8270                    str(x), header=list(ann_header_desc.values())
8271                )
8272            )
8273
8274            # Add snpeff_hgvs to header
8275            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8276                snpeff_hgvs,
8277                ".",
8278                "String",
8279                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8280                "howard calculation",
8281                "0",
8282                self.code_type_map.get("String"),
8283            )
8284
8285            # Update
8286            sql_update = f"""
8287                UPDATE variants
8288                SET "INFO" = 
8289                    concat(
8290                        CASE
8291                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8292                            THEN ''
8293                            ELSE concat("INFO", ';')
8294                        END,
8295                        CASE 
8296                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8297                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8298                            THEN concat(
8299                                    '{snpeff_hgvs}=',
8300                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8301                                )
8302                            ELSE ''
8303                        END
8304                    )
8305                FROM dataframe_snpeff_hgvs
8306                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8307
8308            """
8309            self.conn.execute(sql_update)
8310
8311            # Delete dataframe
8312            del dataframe_snpeff_hgvs
8313            gc.collect()
8314
8315        else:
8316
8317            log.warning(
8318                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8319            )
8320
8321        # Remove added columns
8322        for added_column in added_columns:
8323            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function specifies the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file. Defaults to snpeff_hgvs.
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations; HGVS nomenclatures are extracted from this field and added as a new column. Defaults to ANN.
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8325    def calculation_snpeff_ann_explode(
8326        self,
8327        uniquify: bool = True,
8328        output_format: str = "fields",
8329        output_prefix: str = "snpeff_",
8330        snpeff_field: str = "ANN",
8331    ) -> None:
8332        """
8333        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8334        exploding the HGVS field and updating variant information accordingly.
8335
8336        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8337        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8338        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8339        defaults to True
8340        :type uniquify: bool (optional)
8341        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8342        function specifies the format in which the output annotations will be generated. It has a
8343        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8344        format, defaults to fields
8345        :type output_format: str (optional)
8346        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8347        method is used to specify the prefix that will be added to the output annotations generated
8348        during the calculation process. This prefix helps to differentiate the newly added annotations
8349        from existing ones in the output data. By default, the, defaults to ANN_
8350        :type output_prefix: str (optional)
8351        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8352        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8353        field will be processed to explode the HGVS annotations and update the variant information
8354        accordingly, defaults to ANN
8355        :type snpeff_field: str (optional)
8356        """
8357
8358        # SnpEff annotation field
8359        snpeff_hgvs = "snpeff_ann_explode"
8360
8361        # Snpeff hgvs tags
8362        vcf_infos_tags = {
8363            snpeff_hgvs: "Explode snpEff annotations",
8364        }
8365
8366        # Prefix
8367        prefix = self.get_explode_infos_prefix()
8368        if prefix:
8369            prefix = "INFO/"
8370
8371        # snpEff fields
8372        speff_ann_infos = prefix + snpeff_field
8373        speff_hgvs_infos = prefix + snpeff_hgvs
8374
8375        # Variants table
8376        table_variants = self.get_table_variants()
8377
8378        # Header
8379        vcf_reader = self.get_header()
8380
8381        # Add columns
8382        added_columns = []
8383
8384        # Explode HGVS field in column
8385        added_columns += self.explode_infos(fields=[snpeff_field])
8386        log.debug(f"snpeff_field={snpeff_field}")
8387        log.debug(f"added_columns={added_columns}")
8388
8389        if snpeff_field in vcf_reader.infos:
8390
8391            # Extract ANN header
8392            ann_description = vcf_reader.infos[snpeff_field].desc
8393            pattern = r"'(.+?)'"
8394            match = re.search(pattern, ann_description)
8395            if match:
8396                ann_header_match = match.group(1).split(" | ")
8397                ann_header = []
8398                ann_header_desc = {}
8399                for i in range(len(ann_header_match)):
8400                    ann_header_info = "".join(
8401                        char for char in ann_header_match[i] if char.isalnum()
8402                    )
8403                    ann_header.append(ann_header_info)
8404                    ann_header_desc[ann_header_info] = ann_header_match[i]
8405                if not ann_header_desc:
8406                    raise ValueError("Invalid header description format")
8407            else:
8408                raise ValueError("Invalid header description format")
8409
8410            # Create variant id
8411            variant_id_column = self.get_variant_id_column()
8412            added_columns += [variant_id_column]
8413
8414            # Create dataframe
8415            dataframe_snpeff_hgvs = self.get_query_to_df(
8416                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8417            )
8418
8419            # Create snpEff columns
8420            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8421                speff_ann_infos
8422            ].apply(
8423                lambda x: explode_snpeff_ann(
8424                    str(x),
8425                    uniquify=uniquify,
8426                    output_format=output_format,
8427                    prefix=output_prefix,
8428                    header=list(ann_header_desc.values()),
8429                )
8430            )
8431
8432            # Header
8433            ann_annotations_prefix = ""
8434            if output_format.upper() in ["JSON"]:
8435                ann_annotations_prefix = f"{output_prefix}="
8436                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8437                    output_prefix,
8438                    ".",
8439                    "String",
8440                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8441                    + " - JSON format",
8442                    "howard calculation",
8443                    "0",
8444                    self.code_type_map.get("String"),
8445                )
8446            else:
8447                for ann_annotation in ann_header:
8448                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8449                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8450                        ann_annotation_id,
8451                        ".",
8452                        "String",
8453                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8454                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8455                        "howard calculation",
8456                        "0",
8457                        self.code_type_map.get("String"),
8458                    )
8459
8460            # Update
8461            sql_update = f"""
8462                UPDATE variants
8463                SET "INFO" = 
8464                    concat(
8465                        CASE
8466                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8467                            THEN ''
8468                            ELSE concat("INFO", ';')
8469                        END,
8470                        CASE 
8471                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8472                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8473                            THEN concat(
8474                                '{ann_annotations_prefix}',
8475                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8476                                )
8477                            ELSE ''
8478                        END
8479                    )
8480                FROM dataframe_snpeff_hgvs
8481                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8482
8483            """
8484            self.conn.execute(sql_update)
8485
8486            # Delete dataframe
8487            del dataframe_snpeff_hgvs
8488            gc.collect()
8489
8490        else:
8491
8492            log.warning(
8493                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8494            )
8495
8496        # Remove added columns
8497        for added_column in added_columns:
8498            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method is used to specify the prefix that will be added to the output annotations generated during the calculation process. This prefix helps to differentiate the newly added annotations from existing ones in the output data. Defaults to "snpeff_".
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8500    def calculation_extract_nomen(self) -> None:
8501        """
8502        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8503        """
8504
8505        # NOMEN field
8506        field_nomen_dict = "NOMEN_DICT"
8507
8508        # NOMEN structure
8509        nomen_dict = {
8510            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8511            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8512            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8513            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8514            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8515            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8516            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8517            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8518            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8519            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8520        }
8521
8522        # Param
8523        param = self.get_param()
8524
8525        # Prefix
8526        prefix = self.get_explode_infos_prefix()
8527
8528        # Header
8529        vcf_reader = self.get_header()
8530
8531        # Get HGVS field
8532        hgvs_field = (
8533            param.get("calculation", {})
8534            .get("calculations", {})
8535            .get("NOMEN", {})
8536            .get("options", {})
8537            .get("hgvs_field", "hgvs")
8538        )
8539
8540        # Get transcripts
8541        transcripts_file = (
8542            param.get("calculation", {})
8543            .get("calculations", {})
8544            .get("NOMEN", {})
8545            .get("options", {})
8546            .get("transcripts", None)
8547        )
8548        transcripts_file = full_path(transcripts_file)
8549        transcripts = []
8550        if transcripts_file:
8551            if os.path.exists(transcripts_file):
8552                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
8553                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
8554            else:
8555                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
8556                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
8557
8558        # Added columns
8559        added_columns = []
8560
8561        # Explode HGVS field in column
8562        added_columns += self.explode_infos(fields=[hgvs_field])
8563
8564        # extra infos
8565        extra_infos = self.get_extra_infos()
8566        extra_field = prefix + hgvs_field
8567
8568        if extra_field in extra_infos:
8569
8570            # Create dataframe
8571            dataframe_hgvs = self.get_query_to_df(
8572                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
8573            )
8574
8575            # Create main NOMEN column
8576            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
8577                lambda x: find_nomen(str(x), transcripts=transcripts)
8578            )
8579
8580            # Explode NOMEN Structure and create SQL set for update
8581            sql_nomen_fields = []
8582            for nomen_field in nomen_dict:
8583
8584                # Explode each field into a column
8585                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
8586                    lambda x: dict(x).get(nomen_field, "")
8587                )
8588
8589                # Create VCF header field
8590                vcf_reader.infos[nomen_field] = vcf.parser._Info(
8591                    nomen_field,
8592                    ".",
8593                    "String",
8594                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
8595                    "howard calculation",
8596                    "0",
8597                    self.code_type_map.get("String"),
8598                )
8599                sql_nomen_fields.append(
8600                    f"""
8601                        CASE 
8602                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
8603                            THEN concat(
8604                                    ';{nomen_field}=',
8605                                    dataframe_hgvs."{nomen_field}"
8606                                )
8607                            ELSE ''
8608                        END
8609                    """
8610                )
8611
8612            # SQL set for update
8613            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
8614
8615            # Update
8616            sql_update = f"""
8617                UPDATE variants
8618                SET "INFO" = 
8619                    concat(
8620                        CASE
8621                            WHEN "INFO" IS NULL
8622                            THEN ''
8623                            ELSE "INFO"
8624                        END,
8625                        {sql_nomen_fields_set}
8626                    )
8627                FROM dataframe_hgvs
8628                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
8629                    AND variants."POS" = dataframe_hgvs."POS" 
8630                    AND variants."REF" = dataframe_hgvs."REF"
8631                    AND variants."ALT" = dataframe_hgvs."ALT"
8632            """
8633            self.conn.execute(sql_update)
8634
8635            # Delete dataframe
8636            del dataframe_hgvs
8637            gc.collect()
8638
8639        # Remove added columns
8640        for added_column in added_columns:
8641            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
8643    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8644        """
8645        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8646        pipeline/sample for a variant and updates the variant information in a VCF file.
8647
8648        :param tag: The `tag` parameter is a string that represents the annotation field for the
8649        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8650        VCF header and to update the corresponding field in the variants table, defaults to
8651        findbypipeline
8652        :type tag: str (optional)
8653        """
8654
8655        # if FORMAT and samples
8656        if (
8657            "FORMAT" in self.get_header_columns_as_list()
8658            and self.get_header_sample_list()
8659        ):
8660
8661            # findbypipeline annotation field
8662            findbypipeline_tag = tag
8663
8664            # VCF infos tags
8665            vcf_infos_tags = {
8666                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8667            }
8668
8669            # Prefix
8670            prefix = self.get_explode_infos_prefix()
8671
8672            # Field
8673            findbypipeline_infos = prefix + findbypipeline_tag
8674
8675            # Variants table
8676            table_variants = self.get_table_variants()
8677
8678            # Header
8679            vcf_reader = self.get_header()
8680
8681            # Create variant id
8682            variant_id_column = self.get_variant_id_column()
8683            added_columns = [variant_id_column]
8684
8685            # variant_id, FORMAT and samples
8686            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8687                self.get_header_sample_list()
8688            )
8689
8690            # Create dataframe
8691            dataframe_findbypipeline = self.get_query_to_df(
8692                f""" SELECT {samples_fields} FROM {table_variants} """
8693            )
8694
8695            # Create findbypipeline column
8696            dataframe_findbypipeline[findbypipeline_infos] = (
8697                dataframe_findbypipeline.apply(
8698                    lambda row: findbypipeline(
8699                        row, samples=self.get_header_sample_list()
8700                    ),
8701                    axis=1,
8702                )
8703            )
8704
8705            # Add snpeff_hgvs to header
8706            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8707                findbypipeline_tag,
8708                ".",
8709                "String",
8710                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8711                "howard calculation",
8712                "0",
8713                self.code_type_map.get("String"),
8714            )
8715
8716            # Update
8717            sql_update = f"""
8718                UPDATE variants
8719                SET "INFO" = 
8720                    concat(
8721                        CASE
8722                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8723                            THEN ''
8724                            ELSE concat("INFO", ';')
8725                        END,
8726                        CASE 
8727                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8728                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8729                            THEN concat(
8730                                    '{findbypipeline_tag}=',
8731                                    dataframe_findbypipeline."{findbypipeline_infos}"
8732                                )
8733                            ELSE ''
8734                        END
8735                    )
8736                FROM dataframe_findbypipeline
8737                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8738            """
8739            self.conn.execute(sql_update)
8740
8741            # Remove added columns
8742            for added_column in added_columns:
8743                self.drop_column(column=added_column)
8744
8745            # Delete dataframe
8746            del dataframe_findbypipeline
8747            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8749    def calculation_genotype_concordance(self) -> None:
8750        """
8751        The function `calculation_genotype_concordance` calculates the genotype concordance for
8752        multi-caller VCF files and updates the variant information in the database.
8753        """
8754
8755        # if FORMAT and samples
8756        if (
8757            "FORMAT" in self.get_header_columns_as_list()
8758            and self.get_header_sample_list()
8759        ):
8760
8761            # genotypeconcordance annotation field
8762            genotypeconcordance_tag = "genotypeconcordance"
8763
8764            # VCF infos tags
8765            vcf_infos_tags = {
8766                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8767            }
8768
8769            # Prefix
8770            prefix = self.get_explode_infos_prefix()
8771
8772            # Field
8773            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8774
8775            # Variants table
8776            table_variants = self.get_table_variants()
8777
8778            # Header
8779            vcf_reader = self.get_header()
8780
8781            # Create variant id
8782            variant_id_column = self.get_variant_id_column()
8783            added_columns = [variant_id_column]
8784
8785            # variant_id, FORMAT and samples
8786            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8787                self.get_header_sample_list()
8788            )
8789
8790            # Create dataframe
8791            dataframe_genotypeconcordance = self.get_query_to_df(
8792                f""" SELECT {samples_fields} FROM {table_variants} """
8793            )
8794
8795            # Create genotypeconcordance column
8796            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8797                dataframe_genotypeconcordance.apply(
8798                    lambda row: genotypeconcordance(
8799                        row, samples=self.get_header_sample_list()
8800                    ),
8801                    axis=1,
8802                )
8803            )
8804
8805            # Add genotypeconcordance to header
8806            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8807                genotypeconcordance_tag,
8808                ".",
8809                "String",
8810                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8811                "howard calculation",
8812                "0",
8813                self.code_type_map.get("String"),
8814            )
8815
8816            # Update
8817            sql_update = f"""
8818                UPDATE variants
8819                SET "INFO" = 
8820                    concat(
8821                        CASE
8822                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8823                            THEN ''
8824                            ELSE concat("INFO", ';')
8825                        END,
8826                        CASE
8827                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8828                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8829                            THEN concat(
8830                                    '{genotypeconcordance_tag}=',
8831                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8832                                )
8833                            ELSE ''
8834                        END
8835                    )
8836                FROM dataframe_genotypeconcordance
8837                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8838            """
8839            self.conn.execute(sql_update)
8840
8841            # Remove added columns
8842            for added_column in added_columns:
8843                self.drop_column(column=added_column)
8844
8845            # Delete dataframe
8846            del dataframe_genotypeconcordance
8847            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8849    def calculation_barcode(self, tag: str = "barcode") -> None:
8850        """
8851        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8852        updates the INFO field in the file with the calculated barcode values.
8853
8854        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8855        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8856        the default tag name is set to "barcode", defaults to barcode
8857        :type tag: str (optional)
8858        """
8859
8860        # if FORMAT and samples
8861        if (
8862            "FORMAT" in self.get_header_columns_as_list()
8863            and self.get_header_sample_list()
8864        ):
8865
8866            # barcode annotation field
8867            if not tag:
8868                tag = "barcode"
8869
8870            # VCF infos tags
8871            vcf_infos_tags = {
8872                tag: "barcode calculation (VaRank)",
8873            }
8874
8875            # Prefix
8876            prefix = self.get_explode_infos_prefix()
8877
8878            # Field
8879            barcode_infos = prefix + tag
8880
8881            # Variants table
8882            table_variants = self.get_table_variants()
8883
8884            # Header
8885            vcf_reader = self.get_header()
8886
8887            # Create variant id
8888            variant_id_column = self.get_variant_id_column()
8889            added_columns = [variant_id_column]
8890
8891            # variant_id, FORMAT and samples
8892            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8893                self.get_header_sample_list()
8894            )
8895
8896            # Create dataframe
8897            dataframe_barcode = self.get_query_to_df(
8898                f""" SELECT {samples_fields} FROM {table_variants} """
8899            )
8900
8901            # Create barcode column
8902            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8903                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8904            )
8905
8906            # Add barcode to header
8907            vcf_reader.infos[tag] = vcf.parser._Info(
8908                tag,
8909                ".",
8910                "String",
8911                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8912                "howard calculation",
8913                "0",
8914                self.code_type_map.get("String"),
8915            )
8916
8917            # Update
8918            sql_update = f"""
8919                UPDATE {table_variants}
8920                SET "INFO" = 
8921                    concat(
8922                        CASE
8923                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8924                            THEN ''
8925                            ELSE concat("INFO", ';')
8926                        END,
8927                        CASE
8928                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8929                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8930                            THEN concat(
8931                                    '{tag}=',
8932                                    dataframe_barcode."{barcode_infos}"
8933                                )
8934                            ELSE ''
8935                        END
8936                    )
8937                FROM dataframe_barcode
8938                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8939            """
8940            self.conn.execute(sql_update)
8941
8942            # Remove added columns
8943            for added_column in added_columns:
8944                self.drop_column(column=added_column)
8945
8946            # Delete dataframe
8947            del dataframe_barcode
8948            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a barcode value over a family of
        samples for each variant and writes it into the sample genotypes: every genotype gets
        two extra FORMAT values, `<tag>` (the family barcode) and `<tag>S` (the comma-separated
        family sample list). Note that, unlike `calculation_barcode`, this method updates the
        FORMAT/sample columns, not the INFO field.

        The family pedigree is read from
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"], which may be
        a JSON file path, a JSON string, a comma-separated list of sample names, or a dict.
        Without a pedigree, all samples of the VCF are used.

        :param tag: FORMAT tag added to the VCF genotypes during the calculation; falls back
        to "BCF" when empty, defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no sample
        """

        # Genotype-based calculation: requires a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (default when tag is empty/None)
            if not tag:
                tag = "BCF"

            # VCF header descriptions for the two FORMAT tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Pedigree parameter (file path, JSON string, sample list string, or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file — NOTE: 'ped' is rebound to the file handle,
                # then to the parsed dict
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated sample list mapped to itself
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict — use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Family sample names are the pedigree dict values
                ped_samples = list(ped.values())

            else:
                # No pedigree: every sample of the VCF is part of the family
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Fail on an empty pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column holding the computed value (exploded-INFO prefix + tag)
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column (added for the join, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes into a pandas dataframe. NOTE: the UPDATE below refers
            # to this dataframe by its variable name — do not rename it.
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the two FORMAT tags (<tag> and <tag>S) in the VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: FORMAT gains ':<tag>:<tag>S', family
            # samples gain the barcode and the family sample list, other samples '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes, the regex blanks FORMAT keys and then pads each
                # ':' with '.' — presumably building a './.:.:...' placeholder with one
                # '.' per FORMAT field (TODO confirm against sample data)
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply every SET clause in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
9140    def calculation_trio(self) -> None:
9141        """
9142        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9143        information to the INFO field of each variant.
9144        """
9145
9146        # if FORMAT and samples
9147        if (
9148            "FORMAT" in self.get_header_columns_as_list()
9149            and self.get_header_sample_list()
9150        ):
9151
9152            # trio annotation field
9153            trio_tag = "trio"
9154
9155            # VCF infos tags
9156            vcf_infos_tags = {
9157                "trio": "trio calculation",
9158            }
9159
9160            # Param
9161            param = self.get_param()
9162
9163            # Prefix
9164            prefix = self.get_explode_infos_prefix()
9165
9166            # Trio param
9167            trio_ped = (
9168                param.get("calculation", {})
9169                .get("calculations", {})
9170                .get("TRIO", {})
9171                .get("trio_pedigree", None)
9172            )
9173
9174            # Load trio
9175            if trio_ped:
9176
9177                # Trio pedigree is a file
9178                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9179                    log.debug("TRIO pedigree is file")
9180                    with open(full_path(trio_ped)) as trio_ped:
9181                        trio_ped = json.load(trio_ped)
9182
9183                # Trio pedigree is a string
9184                elif isinstance(trio_ped, str):
9185                    log.debug("TRIO pedigree is str")
9186                    try:
9187                        trio_ped = json.loads(trio_ped)
9188                        log.debug("TRIO pedigree is json str")
9189                    except ValueError as e:
9190                        trio_samples = trio_ped.split(",")
9191                        if len(trio_samples) == 3:
9192                            trio_ped = {
9193                                "father": trio_samples[0],
9194                                "mother": trio_samples[1],
9195                                "child": trio_samples[2],
9196                            }
9197                            log.debug("TRIO pedigree is list str")
9198                        else:
9199                            msg_error = "TRIO pedigree not well formatted"
9200                            log.error(msg_error)
9201                            raise ValueError(msg_error)
9202
9203                # Trio pedigree is a dict
9204                elif isinstance(trio_ped, dict):
9205                    log.debug("TRIO pedigree is dict")
9206
9207                # Trio pedigree is not well formatted
9208                else:
9209                    msg_error = "TRIO pedigree not well formatted"
9210                    log.error(msg_error)
9211                    raise ValueError(msg_error)
9212
9213                # Construct trio list
9214                trio_samples = [
9215                    trio_ped.get("father", ""),
9216                    trio_ped.get("mother", ""),
9217                    trio_ped.get("child", ""),
9218                ]
9219
9220            else:
9221                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9222                samples_list = self.get_header_sample_list()
9223                if len(samples_list) >= 3:
9224                    trio_samples = self.get_header_sample_list()[0:3]
9225                    trio_ped = {
9226                        "father": trio_samples[0],
9227                        "mother": trio_samples[1],
9228                        "child": trio_samples[2],
9229                    }
9230                else:
9231                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9232                    log.error(msg_error)
9233                    raise ValueError(msg_error)
9234
9235            # Check trio pedigree
9236            if not trio_ped or len(trio_ped) != 3:
9237                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9238                log.error(msg_error)
9239                raise ValueError(msg_error)
9240
9241            # Log
9242            log.info(
9243                f"Calculation 'TRIO' - Samples: "
9244                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9245            )
9246
9247            # Field
9248            trio_infos = prefix + trio_tag
9249
9250            # Variants table
9251            table_variants = self.get_table_variants()
9252
9253            # Header
9254            vcf_reader = self.get_header()
9255
9256            # Create variant id
9257            variant_id_column = self.get_variant_id_column()
9258            added_columns = [variant_id_column]
9259
9260            # variant_id, FORMAT and samples
9261            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9262                self.get_header_sample_list()
9263            )
9264
9265            # Create dataframe
9266            dataframe_trio = self.get_query_to_df(
9267                f""" SELECT {samples_fields} FROM {table_variants} """
9268            )
9269
9270            # Create trio column
9271            dataframe_trio[trio_infos] = dataframe_trio.apply(
9272                lambda row: trio(row, samples=trio_samples), axis=1
9273            )
9274
9275            # Add trio to header
9276            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9277                trio_tag,
9278                ".",
9279                "String",
9280                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9281                "howard calculation",
9282                "0",
9283                self.code_type_map.get("String"),
9284            )
9285
9286            # Update
9287            sql_update = f"""
9288                UPDATE {table_variants}
9289                SET "INFO" = 
9290                    concat(
9291                        CASE
9292                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9293                            THEN ''
9294                            ELSE concat("INFO", ';')
9295                        END,
9296                        CASE
9297                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9298                             AND dataframe_trio."{trio_infos}" NOT NULL
9299                            THEN concat(
9300                                    '{trio_tag}=',
9301                                    dataframe_trio."{trio_infos}"
9302                                )
9303                            ELSE ''
9304                        END
9305                    )
9306                FROM dataframe_trio
9307                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9308            """
9309            self.conn.execute(sql_update)
9310
9311            # Remove added columns
9312            for added_column in added_columns:
9313                self.drop_column(column=added_column)
9314
9315            # Delete dataframe
9316            del dataframe_trio
9317            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9319    def calculation_vaf_normalization(self) -> None:
9320        """
9321        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9322        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9323        :return: The function does not return anything.
9324        """
9325
9326        # if FORMAT and samples
9327        if (
9328            "FORMAT" in self.get_header_columns_as_list()
9329            and self.get_header_sample_list()
9330        ):
9331
9332            # vaf_normalization annotation field
9333            vaf_normalization_tag = "VAF"
9334
9335            # VCF infos tags
9336            vcf_infos_tags = {
9337                "VAF": "VAF Variant Frequency",
9338            }
9339
9340            # Prefix
9341            prefix = self.get_explode_infos_prefix()
9342
9343            # Variants table
9344            table_variants = self.get_table_variants()
9345
9346            # Header
9347            vcf_reader = self.get_header()
9348
9349            # Do not calculate if VAF already exists
9350            if "VAF" in vcf_reader.formats:
9351                log.debug("VAF already on genotypes")
9352                return
9353
9354            # Create variant id
9355            variant_id_column = self.get_variant_id_column()
9356            added_columns = [variant_id_column]
9357
9358            # variant_id, FORMAT and samples
9359            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9360                f""" "{sample}" """ for sample in self.get_header_sample_list()
9361            )
9362
9363            # Create dataframe
9364            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9365            log.debug(f"query={query}")
9366            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9367
9368            vaf_normalization_set = []
9369
9370            # for each sample vaf_normalization
9371            for sample in self.get_header_sample_list():
9372                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9373                    lambda row: vaf_normalization(row, sample=sample), axis=1
9374                )
9375                vaf_normalization_set.append(
9376                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9377                )
9378
9379            # Add VAF to FORMAT
9380            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9381                "FORMAT"
9382            ].apply(lambda x: str(x) + ":VAF")
9383            vaf_normalization_set.append(
9384                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9385            )
9386
9387            # Add vaf_normalization to header
9388            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9389                id=vaf_normalization_tag,
9390                num="1",
9391                type="Float",
9392                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9393                type_code=self.code_type_map.get("Float"),
9394            )
9395
9396            # Create fields to add in INFO
9397            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9398
9399            # Update
9400            sql_update = f"""
9401                UPDATE {table_variants}
9402                SET {sql_vaf_normalization_set}
9403                FROM dataframe_vaf_normalization
9404                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9405
9406            """
9407            self.conn.execute(sql_update)
9408
9409            # Remove added columns
9410            for added_column in added_columns:
9411                self.drop_column(column=added_column)
9412
9413            # Delete dataframe
9414            del dataframe_vaf_normalization
9415            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Generated INFO tags (one per statistic): `<info>_stats_nb`, `<info>_stats_list`,
        `<info>_stats_min`, `<info>_stats_max`, `<info>_stats_mean`, `<info>_stats_mediane`
        and `<info>_stats_stdev`. Each tag is also registered in the VCF header.
        The function is a no-op when the VCF has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotype statistics only make sense when the VCF has genotypes:
        # a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base annotation tag for the statistics (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # VCF INFO tags and their header descriptions, one entry per statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column holding the per-variant statistics dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table name
            table_variants = self.get_table_variants()

            # VCF header object (mutated below to register the new INFO tags)
            vcf_reader = self.get_header()

            # Variant id column used to join the DataFrame back to the variants table;
            # it is an added column and is dropped at the end
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and every sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load genotype data into a pandas DataFrame.
            # NOTE(review): the SQL UPDATE below references this local DataFrame by its
            # variable name "dataframe_vaf_stats" (duckdb replacement scan) — do not rename.
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the per-variant statistics dict (one row at a time)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one CASE expression per statistic tag
            sql_vaf_stats_fields = []

            # Build one DataFrame column and one SQL fragment per statistic
            for stat in vcf_infos_tags:

                # Extract this statistic from the per-variant dict.
                # NOTE(review): missing keys default to '' (empty string, not NULL) —
                # see the NOT NULL remark below.
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Register the statistic tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: none before the first tag, ';' before the following ones
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # CASE fragment appending '<stat>=<value>' to INFO.
                # NOTE(review): since missing values are '' rather than NULL, the
                # NOT NULL test still matches and an empty '<stat>=' entry may be
                # emitted — confirm genotype_stats always fills every key.
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Concatenate all CASE fragments into a single SET expression
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistics to the INFO column, joining on the variant id;
            # an existing non-empty INFO gets a ';' separator first
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Drop the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Free the DataFrame memory
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
9555    def calculation_transcripts_annotation(
9556        self, info_json: str = None, info_format: str = None
9557    ) -> None:
9558        """
9559        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
9560        field to it if transcripts are available.
9561
9562        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
9563        is a string parameter that represents the information field to be used in the transcripts JSON.
9564        It is used to specify the JSON format for the transcripts information. If no value is provided
9565        when calling the method, it defaults to "
9566        :type info_json: str
9567        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
9568        method is a string parameter that specifies the format of the information field to be used in
9569        the transcripts JSON. It is used to define the format of the information field
9570        :type info_format: str
9571        """
9572
9573        # Create transcripts table
9574        transcripts_table = self.create_transcript_view()
9575
9576        # Add info field
9577        if transcripts_table:
9578            self.transcript_view_to_variants(
9579                transcripts_table=transcripts_table,
9580                transcripts_info_field_json=info_json,
9581                transcripts_info_field_format=info_format,
9582            )
9583        else:
9584            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
9586    def calculation_transcripts_prioritization(self) -> None:
9587        """
9588        The function `calculation_transcripts_prioritization` creates a transcripts table and
9589        prioritizes transcripts based on certain criteria.
9590        """
9591
9592        # Create transcripts table
9593        transcripts_table = self.create_transcript_view()
9594
9595        # Add info field
9596        if transcripts_table:
9597            self.transcripts_prioritization(transcripts_table=transcripts_table)
9598        else:
9599            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
9605    def transcripts_prioritization(
9606        self, transcripts_table: str = None, param: dict = {}
9607    ) -> bool:
9608        """
9609        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
9610        and updates the variants table with the prioritized information.
9611
9612        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
9613        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
9614        This parameter is used to identify the table where the transcripts data is stored for the
9615        prioritization process
9616        :type transcripts_table: str
9617        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
9618        that contains various configuration settings for the prioritization process of transcripts. It
9619        is used to customize the behavior of the prioritization algorithm and includes settings such as
9620        the prefix for prioritization fields, default profiles, and other
9621        :type param: dict
9622        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
9623        transcripts prioritization process is successfully completed, and `False` if there are any
9624        issues or if no profile is defined for transcripts prioritization.
9625        """
9626
9627        log.debug("Start transcripts prioritization...")
9628
9629        # Param
9630        if not param:
9631            param = self.get_param()
9632
9633        # Variants table
9634        table_variants = self.get_table_variants()
9635        log.debug(f"transcripts_table={transcripts_table}")
9636        # Transcripts table
9637        if transcripts_table is None:
9638            log.debug(f"transcripts_table={transcripts_table}")
9639            transcripts_table = self.create_transcript_view(
9640                transcripts_table="transcripts", param=param
9641            )
9642            log.debug(f"transcripts_table={transcripts_table}")
9643        if transcripts_table is None:
9644            msg_err = "No Transcripts table availalble"
9645            log.error(msg_err)
9646            raise ValueError(msg_err)
9647
9648        # Get transcripts columns
9649        columns_as_list_query = f"""
9650            DESCRIBE {transcripts_table}
9651        """
9652        columns_as_list = list(
9653            self.get_query_to_df(columns_as_list_query)["column_name"]
9654        )
9655
9656        # Create INFO if not exists
9657        if "INFO" not in columns_as_list:
9658            query_add_info = f"""
9659                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
9660            """
9661            self.execute_query(query_add_info)
9662
9663        # Prioritization param and Force only PZ Score and Flag
9664        pz_param = param.get("transcripts", {}).get("prioritization", {})
9665        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
9666        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
9667        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
9668        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
9669        pz_profile_default = (
9670            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
9671        )
9672
9673        # Exit if no profile
9674        if pz_profile_default is None:
9675            log.warning("No profile defined for transcripts prioritization")
9676            return False
9677
9678        # Prioritization
9679        prioritization_result = self.prioritization(
9680            table=transcripts_table,
9681            pz_param=param.get("transcripts", {}).get("prioritization", {}),
9682        )
9683        if not prioritization_result:
9684            log.warning("Transcripts prioritization not processed")
9685            return False
9686
9687        # Explode PZ fields
9688        self.explode_infos(
9689            table=transcripts_table,
9690            fields=param.get("transcripts", {})
9691            .get("prioritization", {})
9692            .get("pzfields", []),
9693        )
9694
9695        # Export Transcripts prioritization infos to variants table
9696        query_update = f"""
9697            WITH RankedTranscripts AS (
9698                SELECT
9699                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
9700                    ROW_NUMBER() OVER (
9701                        PARTITION BY "#CHROM", POS, REF, ALT
9702                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
9703                    ) AS rn
9704                FROM
9705                    {transcripts_table}
9706            )
9707            UPDATE {table_variants}
9708                SET
9709                INFO = CONCAT(CASE
9710                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9711                            THEN ''
9712                            ELSE concat("INFO", ';')
9713                        END,
9714                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
9715                        )
9716            FROM
9717                RankedTranscripts
9718            WHERE
9719                rn = 1
9720                AND variants."#CHROM" = RankedTranscripts."#CHROM"
9721                AND variants."POS" = RankedTranscripts."POS"
9722                AND variants."REF" = RankedTranscripts."REF"
9723                AND variants."ALT" = RankedTranscripts."ALT"
9724                
9725        """
9726        self.execute_query(query=query_update)
9727
9728        # Add PZ Transcript in header
9729        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
9730            pz_fields_transcripts,
9731            ".",
9732            "String",
9733            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
9734            "unknown",
9735            "unknown",
9736            code_type_map["String"],
9737        )
9738
9739        # Return
9740        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9742    def create_transcript_view_from_columns_map(
9743        self,
9744        transcripts_table: str = "transcripts",
9745        columns_maps: dict = {},
9746        added_columns: list = [],
9747        temporary_tables: list = None,
9748        annotation_fields: list = None,
9749    ) -> tuple[list, list, list]:
9750        """
9751        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9752        specified columns mapping for transcripts data.
9753
9754        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9755        the table where the transcripts data is stored or will be stored in the database. This table
9756        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9757        predictions, etc. It defaults to "transcripts, defaults to transcripts
9758        :type transcripts_table: str (optional)
9759        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9760        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9761        represents a mapping configuration for a specific set of columns. It typically includes details such
9762        as the main transcript column and additional information columns
9763        :type columns_maps: dict
9764        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9765        function is a list that stores the additional columns that will be added to the view being created
9766        based on the columns map provided. These columns are generated by exploding the transcript
9767        information columns along with the main transcript column
9768        :type added_columns: list
9769        :param temporary_tables: The `temporary_tables` parameter in the
9770        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9771        tables created during the process of creating a transcript view from a columns map. These temporary
9772        tables are used to store intermediate results or transformations before the final view is generated
9773        :type temporary_tables: list
9774        :param annotation_fields: The `annotation_fields` parameter in the
9775        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9776        for annotation in the query view creation process. These fields are extracted from the
9777        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9778        :type annotation_fields: list
9779        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9780        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9781        """
9782
9783        log.debug("Start transcrpts view creation from columns map...")
9784
9785        # "from_columns_map": [
9786        #     {
9787        #         "transcripts_column": "Ensembl_transcriptid",
9788        #         "transcripts_infos_columns": [
9789        #             "genename",
9790        #             "Ensembl_geneid",
9791        #             "LIST_S2_score",
9792        #             "LIST_S2_pred",
9793        #         ],
9794        #     },
9795        #     {
9796        #         "transcripts_column": "Ensembl_transcriptid",
9797        #         "transcripts_infos_columns": [
9798        #             "genename",
9799        #             "VARITY_R_score",
9800        #             "Aloft_pred",
9801        #         ],
9802        #     },
9803        # ],
9804
9805        # Init
9806        if temporary_tables is None:
9807            temporary_tables = []
9808        if annotation_fields is None:
9809            annotation_fields = []
9810
9811        # Variants table
9812        table_variants = self.get_table_variants()
9813
9814        for columns_map in columns_maps:
9815
9816            # Transcript column
9817            transcripts_column = columns_map.get("transcripts_column", None)
9818
9819            # Transcripts infos columns
9820            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9821
9822            if transcripts_column is not None:
9823
9824                # Explode
9825                added_columns += self.explode_infos(
9826                    fields=[transcripts_column] + transcripts_infos_columns
9827                )
9828
9829                # View clauses
9830                clause_select = []
9831                for field in [transcripts_column] + transcripts_infos_columns:
9832                    clause_select.append(
9833                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9834                    )
9835                    if field not in [transcripts_column]:
9836                        annotation_fields.append(field)
9837
9838                # Querey View
9839                query = f""" 
9840                    SELECT
9841                        "#CHROM", POS, REF, ALT, INFO,
9842                        "{transcripts_column}" AS 'transcript',
9843                        {", ".join(clause_select)}
9844                    FROM (
9845                        SELECT 
9846                            "#CHROM", POS, REF, ALT, INFO,
9847                            {", ".join(clause_select)}
9848                        FROM {table_variants}
9849                        )
9850                    WHERE "{transcripts_column}" IS NOT NULL
9851                """
9852
9853                # Create temporary table
9854                temporary_table = transcripts_table + "".join(
9855                    random.choices(string.ascii_uppercase + string.digits, k=10)
9856                )
9857
9858                # Temporary_tables
9859                temporary_tables.append(temporary_table)
9860                query_view = f"""
9861                    CREATE TEMPORARY TABLE {temporary_table}
9862                    AS ({query})
9863                """
9864                self.execute_query(query=query_view)
9865
9866        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns_maps parameter.
Returns

The function create_transcript_view_from_columns_map returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9868    def create_transcript_view_from_column_format(
9869        self,
9870        transcripts_table: str = "transcripts",
9871        column_formats: dict = {},
9872        temporary_tables: list = None,
9873        annotation_fields: list = None,
9874    ) -> tuple[list, list, list]:
9875        """
9876        The `create_transcript_view_from_column_format` function generates a transcript view based on
9877        specified column formats, adds additional columns and annotation fields, and returns the list of
9878        temporary tables and annotation fields.
9879
9880        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9881        the table containing the transcripts data. This table will be used as the base table for creating
9882        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9883        different table name if needed, defaults to transcripts
9884        :type transcripts_table: str (optional)
9885        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9886        about the columns to be used for creating the transcript view. Each entry in the dictionary
9887        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9888        the provided code snippet:
9889        :type column_formats: dict
9890        :param temporary_tables: The `temporary_tables` parameter in the
9891        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9892        views created during the process of creating a transcript view from a column format. These temporary
9893        views are used to manipulate and extract data before generating the final transcript view. It
9894        :type temporary_tables: list
9895        :param annotation_fields: The `annotation_fields` parameter in the
9896        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9897        that are extracted from the temporary views created during the process. These annotation fields are
9898        obtained by querying the temporary views and extracting the column names excluding specific columns
9899        like `#CH
9900        :type annotation_fields: list
9901        :return: The `create_transcript_view_from_column_format` function returns two lists:
9902        `temporary_tables` and `annotation_fields`.
9903        """
9904
9905        log.debug("Start transcrpts view creation from column format...")
9906
9907        #  "from_column_format": [
9908        #     {
9909        #         "transcripts_column": "ANN",
9910        #         "transcripts_infos_column": "Feature_ID",
9911        #     }
9912        # ],
9913
9914        # Init
9915        if temporary_tables is None:
9916            temporary_tables = []
9917        if annotation_fields is None:
9918            annotation_fields = []
9919
9920        for column_format in column_formats:
9921
9922            # annotation field and transcript annotation field
9923            annotation_field = column_format.get("transcripts_column", "ANN")
9924            transcript_annotation = column_format.get(
9925                "transcripts_infos_column", "Feature_ID"
9926            )
9927
9928            # Temporary View name
9929            temporary_view_name = transcripts_table + "".join(
9930                random.choices(string.ascii_uppercase + string.digits, k=10)
9931            )
9932
9933            # Create temporary view name
9934            temporary_view_name = self.annotation_format_to_table(
9935                uniquify=True,
9936                annotation_field=annotation_field,
9937                view_name=temporary_view_name,
9938                annotation_id=transcript_annotation,
9939            )
9940
9941            # Annotation fields
9942            if temporary_view_name:
9943                query_annotation_fields = f"""
9944                    SELECT *
9945                    FROM (
9946                        DESCRIBE SELECT *
9947                        FROM {temporary_view_name}
9948                        )
9949                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9950                """
9951                df_annotation_fields = self.get_query_to_df(
9952                    query=query_annotation_fields
9953                )
9954
9955                # Add temporary view and annotation fields
9956                temporary_tables.append(temporary_view_name)
9957                annotation_fields += list(set(df_annotation_fields["column_name"]))
9958
9959        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column (for example, mapping the "ANN" column to its "Feature_ID" field).
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view. It
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns like `#CHROM`, `POS`, `REF` and `ALT`.
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
 9961    def create_transcript_view(
 9962        self,
 9963        transcripts_table: str = None,
 9964        transcripts_table_drop: bool = True,
 9965        param: dict = {},
 9966    ) -> str:
 9967        """
 9968        The `create_transcript_view` function generates a transcript view by processing data from a
 9969        specified table based on provided parameters and structural information.
 9970
 9971        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9972        is used to specify the name of the table that will store the final transcript view data. If a table
 9973        name is not provided, the function will create a new table to store the transcript view data, and by
 9974        default,, defaults to transcripts
 9975        :type transcripts_table: str (optional)
 9976        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9977        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9978        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9979        the function will drop the existing transcripts table if it exists, defaults to True
 9980        :type transcripts_table_drop: bool (optional)
 9981        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9982        contains information needed to create a transcript view. It includes details such as the structure
 9983        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9984        the view. This parameter allows for flexibility and customization
 9985        :type param: dict
 9986        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9987        created or modified during the execution of the function.
 9988        """
 9989
 9990        log.debug("Start transcripts view creation...")
 9991
 9992        # Default
 9993        transcripts_table_default = "transcripts"
 9994
 9995        # Param
 9996        if not param:
 9997            param = self.get_param()
 9998
 9999        # Struct
10000        struct = param.get("transcripts", {}).get("struct", None)
10001
10002        if struct:
10003
10004            # Transcripts table
10005            if transcripts_table is None:
10006                transcripts_table = param.get("transcripts", {}).get(
10007                    "table", transcripts_table_default
10008                )
10009
10010            # added_columns
10011            added_columns = []
10012
10013            # Temporary tables
10014            temporary_tables = []
10015
10016            # Annotation fields
10017            annotation_fields = []
10018
10019            # from columns map
10020            columns_maps = struct.get("from_columns_map", [])
10021            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10022                self.create_transcript_view_from_columns_map(
10023                    transcripts_table=transcripts_table,
10024                    columns_maps=columns_maps,
10025                    added_columns=added_columns,
10026                    temporary_tables=temporary_tables,
10027                    annotation_fields=annotation_fields,
10028                )
10029            )
10030            added_columns += added_columns_tmp
10031            temporary_tables += temporary_tables_tmp
10032            annotation_fields += annotation_fields_tmp
10033
10034            # from column format
10035            column_formats = struct.get("from_column_format", [])
10036            temporary_tables_tmp, annotation_fields_tmp = (
10037                self.create_transcript_view_from_column_format(
10038                    transcripts_table=transcripts_table,
10039                    column_formats=column_formats,
10040                    temporary_tables=temporary_tables,
10041                    annotation_fields=annotation_fields,
10042                )
10043            )
10044            temporary_tables += temporary_tables_tmp
10045            annotation_fields += annotation_fields_tmp
10046
10047            # Merge temporary tables query
10048            query_merge = ""
10049            for temporary_table in temporary_tables:
10050
10051                # First temporary table
10052                if not query_merge:
10053                    query_merge = f"""
10054                        SELECT * FROM {temporary_table}
10055                    """
10056                # other temporary table (using UNION)
10057                else:
10058                    query_merge += f"""
10059                        UNION BY NAME SELECT * FROM {temporary_table}
10060                    """
10061
10062            # Merge on transcript
10063            query_merge_on_transcripts_annotation_fields = []
10064            # Aggregate all annotations fields
10065            for annotation_field in set(annotation_fields):
10066                query_merge_on_transcripts_annotation_fields.append(
10067                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
10068                )
10069            # Query for transcripts view
10070            query_merge_on_transcripts = f"""
10071                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
10072                FROM ({query_merge})
10073                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
10074            """
10075
10076            # Drop transcript view is necessary
10077            if transcripts_table_drop:
10078                query_drop = f"""
10079                    DROP TABLE IF EXISTS {transcripts_table};
10080                """
10081                self.execute_query(query=query_drop)
10082
10083            # Merge and create transcript view
10084            query_create_view = f"""
10085                CREATE TABLE IF NOT EXISTS {transcripts_table}
10086                AS {query_merge_on_transcripts}
10087            """
10088            self.execute_query(query=query_create_view)
10089
10090            # Remove added columns
10091            for added_column in added_columns:
10092                self.drop_column(column=added_column)
10093
10094        else:
10095
10096            transcripts_table = None
10097
10098        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, named "transcripts" by default.
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts') -> str:
10100    def annotation_format_to_table(
10101        self,
10102        uniquify: bool = True,
10103        annotation_field: str = "ANN",
10104        annotation_id: str = "Feature_ID",
10105        view_name: str = "transcripts",
10106    ) -> str:
10107        """
10108        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
10109        table format.
10110
10111        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
10112        values in the output or not. If set to `True`, the function will make sure that the output values
10113        are unique, defaults to True
10114        :type uniquify: bool (optional)
10115        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
10116        contains the annotation information for each variant. This field is used to extract the annotation
10117        details for further processing in the function, defaults to ANN
10118        :type annotation_field: str (optional)
10119        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is
10120        used to specify the identifier for the annotation feature. This identifier will be used as a column
10121        name in the resulting table or view that is created based on the annotation data. It helps in
10122        uniquely identifying each annotation entry in the, defaults to Feature_ID
10123        :type annotation_id: str (optional)
10124        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to
10125        specify the name of the temporary table that will be created to store the transformed annotation
10126        data. This table will hold the extracted information from the annotation field in a structured
10127        format for further processing or analysis, defaults to transcripts
10128        :type view_name: str (optional)
10129        :return: The function `annotation_format_to_table` is returning the name of the view created, which
10130        is stored in the variable `view_name`.
10131        """
10132
10133        # Annotation field
10134        annotation_format = "annotation_explode"
10135
10136        # Transcript annotation
10137        annotation_id = "".join(char for char in annotation_id if char.isalnum())
10138
10139        # Prefix
10140        prefix = self.get_explode_infos_prefix()
10141        if prefix:
10142            prefix = "INFO/"
10143
10144        # Annotation fields
10145        annotation_infos = prefix + annotation_field
10146        annotation_format_infos = prefix + annotation_format
10147
10148        # Variants table
10149        table_variants = self.get_table_variants()
10150
10151        # Header
10152        vcf_reader = self.get_header()
10153
10154        # Add columns
10155        added_columns = []
10156
10157        # Explode HGVS field in column
10158        added_columns += self.explode_infos(fields=[annotation_field])
10159
10160        if annotation_field in vcf_reader.infos:
10161
10162            # Extract ANN header
10163            ann_description = vcf_reader.infos[annotation_field].desc
10164            pattern = r"'(.+?)'"
10165            match = re.search(pattern, ann_description)
10166            if match:
10167                ann_header_match = match.group(1).split(" | ")
10168                ann_header = []
10169                ann_header_desc = {}
10170                for i in range(len(ann_header_match)):
10171                    ann_header_info = "".join(
10172                        char for char in ann_header_match[i] if char.isalnum()
10173                    )
10174                    ann_header.append(ann_header_info)
10175                    ann_header_desc[ann_header_info] = ann_header_match[i]
10176                if not ann_header_desc:
10177                    raise ValueError("Invalid header description format")
10178            else:
10179                raise ValueError("Invalid header description format")
10180
10181            # Create variant id
10182            variant_id_column = self.get_variant_id_column()
10183            added_columns += [variant_id_column]
10184
10185            # Create dataframe
10186            dataframe_annotation_format = self.get_query_to_df(
10187                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
10188            )
10189
10190            # Create annotation columns
10191            dataframe_annotation_format[
10192                annotation_format_infos
10193            ] = dataframe_annotation_format[annotation_infos].apply(
10194                lambda x: explode_annotation_format(
10195                    annotation=str(x),
10196                    uniquify=uniquify,
10197                    output_format="JSON",
10198                    prefix="",
10199                    header=list(ann_header_desc.values()),
10200                )
10201            )
10202
10203            # Find keys
10204            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
10205            df_keys = self.get_query_to_df(query=query_json)
10206
10207            # Check keys
10208            query_json_key = []
10209            for _, row in df_keys.iterrows():
10210
10211                # Key
10212                key = row.iloc[0]
10213
10214                # key_clean
10215                key_clean = "".join(char for char in key if char.isalnum())
10216
10217                # Type
10218                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
10219
10220                # Get DataFrame from query
10221                df_json_type = self.get_query_to_df(query=query_json_type)
10222
10223                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
10224                with pd.option_context("future.no_silent_downcasting", True):
10225                    df_json_type.fillna(value="", inplace=True)
10226                    replace_dict = {None: np.nan, "": np.nan}
10227                    df_json_type.replace(replace_dict, inplace=True)
10228                    df_json_type.dropna(inplace=True)
10229
10230                # Detect column type
10231                column_type = detect_column_type(df_json_type[key_clean])
10232
10233                # Append
10234                query_json_key.append(
10235                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
10236                )
10237
10238            # Create view
10239            query_view = f"""
10240                CREATE TEMPORARY TABLE {view_name}
10241                AS (
10242                    SELECT *, {annotation_id} AS 'transcript'
10243                    FROM (
10244                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
10245                        FROM dataframe_annotation_format
10246                        )
10247                    );
10248            """
10249            self.execute_query(query=query_view)
10250
10251        else:
10252
10253            # Return None
10254            view_name = None
10255
10256        # Remove added columns
10257        for added_column in added_columns:
10258            self.drop_column(column=added_column)
10259
10260        return view_name

The function annotation_format_to_table converts annotation data from a VCF file into a structured table format.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry in the resulting table. Defaults to "Feature_ID".
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
10262    def transcript_view_to_variants(
10263        self,
10264        transcripts_table: str = None,
10265        transcripts_column_id: str = None,
10266        transcripts_info_json: str = None,
10267        transcripts_info_field_json: str = None,
10268        transcripts_info_format: str = None,
10269        transcripts_info_field_format: str = None,
10270        param: dict = {},
10271    ) -> bool:
10272        """
10273        The `transcript_view_to_variants` function updates a variants table with information from
10274        transcripts in JSON format.
10275
10276        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10277        table containing the transcripts data. If this parameter is not provided, the function will
10278        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10279        :type transcripts_table: str
10280        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10281        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10282        identifier is used to match transcripts with variants in the database
10283        :type transcripts_column_id: str
10284        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10285        of the column in the variants table where the transcripts information will be stored in JSON
10286        format. This parameter allows you to define the column in the variants table that will hold the
10287        JSON-formatted information about transcripts
10288        :type transcripts_info_json: str
10289        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10290        specify the field in the VCF header that will contain information about transcripts in JSON
10291        format. This field will be added to the VCF header as an INFO field with the specified name
10292        :type transcripts_info_field_json: str
10293        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10294        format of the information about transcripts that will be stored in the variants table. This
10295        format can be used to define how the transcript information will be structured or displayed
10296        within the variants table
10297        :type transcripts_info_format: str
10298        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10299        specify the field in the VCF header that will contain information about transcripts in a
10300        specific format. This field will be added to the VCF header as an INFO field with the specified
10301        name
10302        :type transcripts_info_field_format: str
10303        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10304        that contains various configuration settings related to transcripts. It is used to provide
10305        default values for certain parameters if they are not explicitly provided when calling the
10306        method. The `param` dictionary can be passed as an argument
10307        :type param: dict
10308        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10309        if the operation is successful and `False` if certain conditions are not met.
10310        """
10311
10312        msg_info_prefix = "Start transcripts view to variants annotations"
10313
10314        log.debug(f"{msg_info_prefix}...")
10315
10316        # Default
10317        transcripts_table_default = "transcripts"
10318        transcripts_column_id_default = "transcript"
10319        transcripts_info_json_default = None
10320        transcripts_info_format_default = None
10321        transcripts_info_field_json_default = None
10322        transcripts_info_field_format_default = None
10323
10324        # Param
10325        if not param:
10326            param = self.get_param()
10327
10328        # Transcripts table
10329        if transcripts_table is None:
10330            transcripts_table = param.get("transcripts", {}).get(
10331                "table", transcripts_table_default
10332            )
10333
10334        # Transcripts column ID
10335        if transcripts_column_id is None:
10336            transcripts_column_id = param.get("transcripts", {}).get(
10337                "column_id", transcripts_column_id_default
10338            )
10339
10340        # Transcripts info json
10341        if transcripts_info_json is None:
10342            transcripts_info_json = param.get("transcripts", {}).get(
10343                "transcripts_info_json", transcripts_info_json_default
10344            )
10345
10346        # Transcripts info field JSON
10347        if transcripts_info_field_json is None:
10348            transcripts_info_field_json = param.get("transcripts", {}).get(
10349                "transcripts_info_field_json", transcripts_info_field_json_default
10350            )
10351        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10352        #     transcripts_info_json = transcripts_info_field_json
10353
10354        # Transcripts info format
10355        if transcripts_info_format is None:
10356            transcripts_info_format = param.get("transcripts", {}).get(
10357                "transcripts_info_format", transcripts_info_format_default
10358            )
10359
10360        # Transcripts info field FORMAT
10361        if transcripts_info_field_format is None:
10362            transcripts_info_field_format = param.get("transcripts", {}).get(
10363                "transcripts_info_field_format", transcripts_info_field_format_default
10364            )
10365        # if (
10366        #     transcripts_info_field_format is not None
10367        #     and transcripts_info_format is None
10368        # ):
10369        #     transcripts_info_format = transcripts_info_field_format
10370
10371        # Variants table
10372        table_variants = self.get_table_variants()
10373
10374        # Check info columns param
10375        if (
10376            transcripts_info_json is None
10377            and transcripts_info_field_json is None
10378            and transcripts_info_format is None
10379            and transcripts_info_field_format is None
10380        ):
10381            return False
10382
10383        # Transcripts infos columns
10384        query_transcripts_infos_columns = f"""
10385            SELECT *
10386            FROM (
10387                DESCRIBE SELECT * FROM {transcripts_table}
10388                )
10389            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10390        """
10391        transcripts_infos_columns = list(
10392            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10393        )
10394
10395        # View results
10396        clause_select = []
10397        clause_to_json = []
10398        clause_to_format = []
10399        for field in transcripts_infos_columns:
10400            clause_select.append(
10401                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10402            )
10403            clause_to_json.append(f""" '{field}': "{field}" """)
10404            clause_to_format.append(f""" "{field}" """)
10405
10406        # Update
10407        update_set_json = []
10408        update_set_format = []
10409
10410        # VCF header
10411        vcf_reader = self.get_header()
10412
10413        # Transcripts to info column in JSON
10414        if transcripts_info_json is not None:
10415
10416            # Create column on variants table
10417            self.add_column(
10418                table_name=table_variants,
10419                column_name=transcripts_info_json,
10420                column_type="JSON",
10421                default_value=None,
10422                drop=False,
10423            )
10424
10425            # Add header
10426            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10427                transcripts_info_json,
10428                ".",
10429                "String",
10430                "Transcripts in JSON format",
10431                "unknwon",
10432                "unknwon",
10433                self.code_type_map["String"],
10434            )
10435
10436            # Add to update
10437            update_set_json.append(
10438                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10439            )
10440
10441        # Transcripts to info field in JSON
10442        if transcripts_info_field_json is not None:
10443
10444            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10445
10446            # Add to update
10447            update_set_json.append(
10448                f""" 
10449                    INFO = concat(
10450                            CASE
10451                                WHEN INFO NOT IN ('', '.')
10452                                THEN INFO
10453                                ELSE ''
10454                            END,
10455                            CASE
10456                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10457                                THEN concat(
10458                                    ';{transcripts_info_field_json}=',
10459                                    t.{transcripts_info_json}
10460                                )
10461                                ELSE ''
10462                            END
10463                            )
10464                """
10465            )
10466
10467            # Add header
10468            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10469                transcripts_info_field_json,
10470                ".",
10471                "String",
10472                "Transcripts in JSON format",
10473                "unknwon",
10474                "unknwon",
10475                self.code_type_map["String"],
10476            )
10477
10478        if update_set_json:
10479
10480            # Update query
10481            query_update = f"""
10482                UPDATE {table_variants}
10483                    SET {", ".join(update_set_json)}
10484                FROM
10485                (
10486                    SELECT
10487                        "#CHROM", POS, REF, ALT,
10488                            concat(
10489                            '{{',
10490                            string_agg(
10491                                '"' || "{transcripts_column_id}" || '":' ||
10492                                to_json(json_output)
10493                            ),
10494                            '}}'
10495                            )::JSON AS {transcripts_info_json}
10496                    FROM
10497                        (
10498                        SELECT
10499                            "#CHROM", POS, REF, ALT,
10500                            "{transcripts_column_id}",
10501                            to_json(
10502                                {{{",".join(clause_to_json)}}}
10503                            )::JSON AS json_output
10504                        FROM
10505                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10506                        WHERE "{transcripts_column_id}" IS NOT NULL
10507                        )
10508                    GROUP BY "#CHROM", POS, REF, ALT
10509                ) AS t
10510                WHERE {table_variants}."#CHROM" = t."#CHROM"
10511                    AND {table_variants}."POS" = t."POS"
10512                    AND {table_variants}."REF" = t."REF"
10513                    AND {table_variants}."ALT" = t."ALT"
10514            """
10515
10516            self.execute_query(query=query_update)
10517
10518        # Transcripts to info column in FORMAT
10519        if transcripts_info_format is not None:
10520
10521            # Create column on variants table
10522            self.add_column(
10523                table_name=table_variants,
10524                column_name=transcripts_info_format,
10525                column_type="VARCHAR",
10526                default_value=None,
10527                drop=False,
10528            )
10529
10530            # Add header
10531            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10532                transcripts_info_format,
10533                ".",
10534                "String",
10535                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10536                "unknwon",
10537                "unknwon",
10538                self.code_type_map["String"],
10539            )
10540
10541            # Add to update
10542            update_set_format.append(
10543                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10544            )
10545
10546        # Transcripts to info field in JSON
10547        if transcripts_info_field_format is not None:
10548
10549            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10550
10551            # Add to update
10552            update_set_format.append(
10553                f""" 
10554                    INFO = concat(
10555                            CASE
10556                                WHEN INFO NOT IN ('', '.')
10557                                THEN INFO
10558                                ELSE ''
10559                            END,
10560                            CASE
10561                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10562                                THEN concat(
10563                                    ';{transcripts_info_field_format}=',
10564                                    t.{transcripts_info_format}
10565                                )
10566                                ELSE ''
10567                            END
10568                            )
10569                """
10570            )
10571
10572            # Add header
10573            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10574                transcripts_info_field_format,
10575                ".",
10576                "String",
10577                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10578                "unknwon",
10579                "unknwon",
10580                self.code_type_map["String"],
10581            )
10582
10583        if update_set_format:
10584
10585            # Update query
10586            query_update = f"""
10587                UPDATE {table_variants}
10588                    SET {", ".join(update_set_format)}
10589                FROM
10590                (
10591                    SELECT
10592                        "#CHROM", POS, REF, ALT,
10593                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10594                    FROM 
10595                        (
10596                        SELECT
10597                            "#CHROM", POS, REF, ALT,
10598                            "{transcripts_column_id}",
10599                            concat(
10600                                "{transcripts_column_id}",
10601                                '|',
10602                                {", '|', ".join(clause_to_format)}
10603                            ) AS {transcripts_info_format}
10604                        FROM
10605                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10606                        )
10607                    GROUP BY "#CHROM", POS, REF, ALT
10608                ) AS t
10609                WHERE {table_variants}."#CHROM" = t."#CHROM"
10610                    AND {table_variants}."POS" = t."POS"
10611                    AND {table_variants}."REF" = t."REF"
10612                    AND {table_variants}."ALT" = t."ALT"
10613            """
10614
10615            self.execute_query(query=query_update)
10616
10617        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter specifies the name of the table containing the transcripts data. If this parameter is not provided, the function attempts to retrieve it from the param dictionary, falling back to the default value of "transcripts".
  • transcripts_column_id: The transcripts_column_id parameter specifies the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
  • transcripts_info_json: The transcripts_info_json parameter specifies the name of the column in the variants table where the transcripts information will be stored in JSON format. It defines which column in the variants table will hold the JSON-formatted transcript information.
  • transcripts_info_field_json: The transcripts_info_field_json parameter specifies the name of the INFO field, added to the VCF header, that will carry the transcripts information in JSON format.
  • transcripts_info_format: The transcripts_info_format parameter specifies the format of the transcript information that will be stored in the variants table, defining how that information is structured or displayed.
  • transcripts_info_field_format: The transcripts_info_field_format parameter specifies the name of the INFO field, added to the VCF header, that will carry the transcripts information in the structured (pipe-delimited) format.
  • param: The param parameter is a dictionary of configuration settings related to transcripts. It supplies default values for the parameters above when they are not explicitly provided to the method.
Returns

The function transcript_view_to_variants returns a boolean value: True when the update completes successfully, and False when a required precondition (such as a missing transcripts table or column identifier) prevents the update from running.